LLVM 21.0.0git
MemProfContextDisambiguation.cpp
Go to the documentation of this file.
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
43#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambigous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125// Optionally enable cloning of callsites involved with recursive cycles
127 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
128 cl::desc("Allow cloning of callsites involved in recursive cycles"));
129
130// When disabled, try to detect and prevent cloning of recursive contexts.
131// This is only necessary until we support cloning through recursive cycles.
132// Leave on by default for now, as disabling requires a little bit of compile
133// time overhead and doesn't affect correctness, it will just inflate the cold
134// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
136 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
137 cl::desc("Allow cloning of contexts through recursive cycles"));
138
139namespace llvm {
141 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
142 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
143
144// Indicate we are linking with an allocator that supports hot/cold operator
145// new interfaces.
147 "supports-hot-cold-new", cl::init(false), cl::Hidden,
148 cl::desc("Linking with hot/cold operator new interfaces"));
149
151 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
152 cl::desc(
153 "Require target function definition when promoting indirect calls"));
154} // namespace llvm
155
158
159namespace {
160/// CRTP base for graphs built from either IR or ThinLTO summary index.
161///
162/// The graph represents the call contexts in all memprof metadata on allocation
163/// calls, with nodes for the allocations themselves, as well as for the calls
164/// in each context. The graph is initially built from the allocation memprof
165/// metadata (or summary) MIBs. It is then updated to match calls with callsite
166/// metadata onto the nodes, updating it to reflect any inlining performed on
167/// those calls.
168///
169/// Each MIB (representing an allocation's call context with allocation
170/// behavior) is assigned a unique context id during the graph build. The edges
171/// and nodes in the graph are decorated with the context ids they carry. This
172/// is used to correctly update the graph when cloning is performed so that we
173/// can uniquify the context for a single (possibly cloned) allocation.
174template <typename DerivedCCG, typename FuncTy, typename CallTy>
175class CallsiteContextGraph {
176public:
177 CallsiteContextGraph() = default;
178 CallsiteContextGraph(const CallsiteContextGraph &) = default;
179 CallsiteContextGraph(CallsiteContextGraph &&) = default;
180
181 /// Main entry point to perform analysis and transformations on graph.
182 bool process();
183
184 /// Perform cloning on the graph necessary to uniquely identify the allocation
185 /// behavior of an allocation based on its context.
186 void identifyClones();
187
188 /// Assign callsite clones to functions, cloning functions as needed to
189 /// accommodate the combinations of their callsite clones reached by callers.
190 /// For regular LTO this clones functions and callsites in the IR, but for
191 /// ThinLTO the cloning decisions are noted in the summaries and later applied
192 /// in applyImport.
193 bool assignFunctions();
194
195 void dump() const;
196 void print(raw_ostream &OS) const;
197 void printTotalSizes(raw_ostream &OS) const;
198
200 const CallsiteContextGraph &CCG) {
201 CCG.print(OS);
202 return OS;
203 }
204
205 friend struct GraphTraits<
206 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
207 friend struct DOTGraphTraits<
208 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
209
210 void exportToDot(std::string Label) const;
211
212 /// Represents a function clone via FuncTy pointer and clone number pair.
213 struct FuncInfo final
214 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
215 using Base = std::pair<FuncTy *, unsigned>;
216 FuncInfo(const Base &B) : Base(B) {}
217 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
218 explicit operator bool() const { return this->first != nullptr; }
219 FuncTy *func() const { return this->first; }
220 unsigned cloneNo() const { return this->second; }
221 };
222
223 /// Represents a callsite clone via CallTy and clone number pair.
224 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
225 using Base = std::pair<CallTy, unsigned>;
226 CallInfo(const Base &B) : Base(B) {}
227 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
228 : Base(Call, CloneNo) {}
229 explicit operator bool() const { return (bool)this->first; }
230 CallTy call() const { return this->first; }
231 unsigned cloneNo() const { return this->second; }
232 void setCloneNo(unsigned N) { this->second = N; }
233 void print(raw_ostream &OS) const {
234 if (!operator bool()) {
235 assert(!cloneNo());
236 OS << "null Call";
237 return;
238 }
239 call()->print(OS);
240 OS << "\t(clone " << cloneNo() << ")";
241 }
242 void dump() const {
243 print(dbgs());
244 dbgs() << "\n";
245 }
246 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
247 Call.print(OS);
248 return OS;
249 }
250 };
251
252 struct ContextEdge;
253
254 /// Node in the Callsite Context Graph
255 struct ContextNode {
256 // Keep this for now since in the IR case where we have an Instruction* it
257 // is not as immediately discoverable. Used for printing richer information
258 // when dumping graph.
259 bool IsAllocation;
260
261 // Keeps track of when the Call was reset to null because there was
262 // recursion.
263 bool Recursive = false;
264
265 // This will be formed by ORing together the AllocationType enum values
266 // for contexts including this node.
267 uint8_t AllocTypes = 0;
268
269 // The corresponding allocation or interior call. This is the primary call
270 // for which we have created this node.
272
273 // List of other calls that can be treated the same as the primary call
274 // through cloning. I.e. located in the same function and have the same
275 // (possibly pruned) stack ids. They will be updated the same way as the
276 // primary call when assigning to function clones.
277 SmallVector<CallInfo, 0> MatchingCalls;
278
279 // For alloc nodes this is a unique id assigned when constructed, and for
280 // callsite stack nodes it is the original stack id when the node is
281 // constructed from the memprof MIB metadata on the alloc nodes. Note that
282 // this is only used when matching callsite metadata onto the stack nodes
283 // created when processing the allocation memprof MIBs, and for labeling
284 // nodes in the dot graph. Therefore we don't bother to assign a value for
285 // clones.
286 uint64_t OrigStackOrAllocId = 0;
287
288 // Edges to all callees in the profiled call stacks.
289 // TODO: Should this be a map (from Callee node) for more efficient lookup?
290 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
291
292 // Edges to all callers in the profiled call stacks.
293 // TODO: Should this be a map (from Caller node) for more efficient lookup?
294 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
295
296 // Get the list of edges from which we can compute allocation information
297 // such as the context ids and allocation type of this node.
298 const std::vector<std::shared_ptr<ContextEdge>> *
299 getEdgesWithAllocInfo() const {
300 // If node has any callees, compute from those, otherwise compute from
301 // callers (i.e. if this is the leaf allocation node).
302 if (!CalleeEdges.empty())
303 return &CalleeEdges;
304 // Typically if the callee edges are empty either the caller edges are
305 // also empty, or this is an allocation (leaf node). However, if we are
306 // allowing recursive callsites and contexts this will be violated for
307 // incompletely cloned recursive cycles.
308 assert(CallerEdges.empty() || IsAllocation ||
310 if (!CallerEdges.empty() && IsAllocation)
311 return &CallerEdges;
312 return nullptr;
313 }
314
315 // Compute the context ids for this node from the union of its edge context
316 // ids.
317 DenseSet<uint32_t> getContextIds() const {
318 DenseSet<uint32_t> ContextIds;
319 auto *Edges = getEdgesWithAllocInfo();
320 if (!Edges)
321 return {};
322 unsigned Count = 0;
323 for (auto &Edge : *Edges)
324 Count += Edge->getContextIds().size();
325 ContextIds.reserve(Count);
326 for (auto &Edge : *Edges)
327 ContextIds.insert(Edge->getContextIds().begin(),
328 Edge->getContextIds().end());
329 return ContextIds;
330 }
331
332 // Compute the allocation type for this node from the OR of its edge
333 // allocation types.
334 uint8_t computeAllocType() const {
335 auto *Edges = getEdgesWithAllocInfo();
336 if (!Edges)
337 return (uint8_t)AllocationType::None;
338 uint8_t BothTypes =
339 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
340 uint8_t AllocType = (uint8_t)AllocationType::None;
341 for (auto &Edge : *Edges) {
342 AllocType |= Edge->AllocTypes;
343 // Bail early if alloc type reached both, no further refinement.
344 if (AllocType == BothTypes)
345 return AllocType;
346 }
347 return AllocType;
348 }
349
350 // The context ids set for this node is empty if its edge context ids are
351 // also all empty.
352 bool emptyContextIds() const {
353 auto *Edges = getEdgesWithAllocInfo();
354 if (!Edges)
355 return true;
356 for (auto &Edge : *Edges) {
357 if (!Edge->getContextIds().empty())
358 return false;
359 }
360 return true;
361 }
362
363 // List of clones of this ContextNode, initially empty.
364 std::vector<ContextNode *> Clones;
365
366 // If a clone, points to the original uncloned node.
367 ContextNode *CloneOf = nullptr;
368
369 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
370
371 ContextNode(bool IsAllocation, CallInfo C)
372 : IsAllocation(IsAllocation), Call(C) {}
373
374 void addClone(ContextNode *Clone) {
375 if (CloneOf) {
376 CloneOf->Clones.push_back(Clone);
377 Clone->CloneOf = CloneOf;
378 } else {
379 Clones.push_back(Clone);
380 assert(!Clone->CloneOf);
381 Clone->CloneOf = this;
382 }
383 }
384
385 ContextNode *getOrigNode() {
386 if (!CloneOf)
387 return this;
388 return CloneOf;
389 }
390
391 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
392 unsigned int ContextId);
393
394 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
395 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
396 void eraseCalleeEdge(const ContextEdge *Edge);
397 void eraseCallerEdge(const ContextEdge *Edge);
398
399 void setCall(CallInfo C) { Call = C; }
400
401 bool hasCall() const { return (bool)Call.call(); }
402
403 void printCall(raw_ostream &OS) const { Call.print(OS); }
404
405 // True if this node was effectively removed from the graph, in which case
406 // it should have an allocation type of None and empty context ids.
407 bool isRemoved() const {
408 // Typically if the callee edges are empty either the caller edges are
409 // also empty, or this is an allocation (leaf node). However, if we are
410 // allowing recursive callsites and contexts this will be violated for
411 // incompletely cloned recursive cycles.
413 (AllocTypes == (uint8_t)AllocationType::None) ==
414 emptyContextIds());
415 return AllocTypes == (uint8_t)AllocationType::None;
416 }
417
418 void dump() const;
419 void print(raw_ostream &OS) const;
420
421 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
422 Node.print(OS);
423 return OS;
424 }
425 };
426
427 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
428 /// callee.
429 struct ContextEdge {
430 ContextNode *Callee;
431 ContextNode *Caller;
432
433 // This will be formed by ORing together the AllocationType enum values
434 // for contexts including this edge.
435 uint8_t AllocTypes = 0;
436
437 // The set of IDs for contexts including this edge.
438 DenseSet<uint32_t> ContextIds;
439
440 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
441 DenseSet<uint32_t> ContextIds)
442 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
443 ContextIds(std::move(ContextIds)) {}
444
445 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
446
447 // Helper to clear the fields of this edge when we are removing it from the
448 // graph.
449 inline void clear() {
450 ContextIds.clear();
451 AllocTypes = (uint8_t)AllocationType::None;
452 Caller = nullptr;
453 Callee = nullptr;
454 }
455
456 // Check if edge was removed from the graph. This is useful while iterating
457 // over a copy of edge lists when performing operations that mutate the
458 // graph in ways that might remove one of the edges.
459 inline bool isRemoved() const {
460 if (Callee || Caller)
461 return false;
462 // Any edges that have been removed from the graph but are still in a
463 // shared_ptr somewhere should have all fields null'ed out by clear()
464 // above.
465 assert(AllocTypes == (uint8_t)AllocationType::None);
466 assert(ContextIds.empty());
467 return true;
468 }
469
470 void dump() const;
471 void print(raw_ostream &OS) const;
472
473 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
474 Edge.print(OS);
475 return OS;
476 }
477 };
478
479 /// Helpers to remove edges that have allocation type None (due to not
480 /// carrying any context ids) after transformations.
481 void removeNoneTypeCalleeEdges(ContextNode *Node);
482 void removeNoneTypeCallerEdges(ContextNode *Node);
483 void
484 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
486
487protected:
488 /// Get a list of nodes corresponding to the stack ids in the given callsite
489 /// context.
490 template <class NodeT, class IteratorT>
491 std::vector<uint64_t>
492 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
493
494 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
495 /// metadata (or summary).
496 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
497
498 /// Adds nodes for the given MIB stack ids.
499 template <class NodeT, class IteratorT>
500 void addStackNodesForMIB(ContextNode *AllocNode,
501 CallStack<NodeT, IteratorT> &StackContext,
502 CallStack<NodeT, IteratorT> &CallsiteContext,
504 ArrayRef<ContextTotalSize> ContextSizeInfo);
505
506 /// Matches all callsite metadata (or summary) to the nodes created for
507 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
508 /// inlining performed on those callsite instructions.
509 void updateStackNodes();
510
511 /// Update graph to conservatively handle any callsite stack nodes that target
512 /// multiple different callee target functions.
513 void handleCallsitesWithMultipleTargets();
514
515 // Try to partition calls on the given node (already placed into the AllCalls
516 // array) by callee function, creating new copies of Node as needed to hold
517 // calls with different callees, and moving the callee edges appropriately.
518 // Returns true if partitioning was successful.
519 bool partitionCallsByCallee(
520 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
521 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
522
523 /// Save lists of calls with MemProf metadata in each function, for faster
524 /// iteration.
525 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
526
527 /// Map from callsite node to the enclosing caller function.
528 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
529
530private:
531 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
532
533 // Structure to keep track of information for each call as we are matching
534 // non-allocation callsites onto context nodes created from the allocation
535 // call metadata / summary contexts.
536 struct CallContextInfo {
537 // The callsite we're trying to match.
538 CallTy Call;
539 // The callsites stack ids that have a context node in the graph.
540 std::vector<uint64_t> StackIds;
541 // The function containing this callsite.
542 const FuncTy *Func;
543 // Initially empty, if needed this will be updated to contain the context
544 // ids for use in a new context node created for this callsite.
545 DenseSet<uint32_t> ContextIds;
546 };
547
548 /// Helper to remove edge from graph, updating edge iterator if it is provided
549 /// (in which case CalleeIter indicates which edge list is being iterated).
550 /// This will also perform the necessary clearing of the ContextEdge members
551 /// to enable later checking if the edge has been removed (since we may have
552 /// other copies of the shared_ptr in existence, and in fact rely on this to
553 /// enable removal while iterating over a copy of a node's edge list).
554 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
555 bool CalleeIter = true);
556
557 /// Assigns the given Node to calls at or inlined into the location with
558 /// the Node's stack id, after post order traversing and processing its
559 /// caller nodes. Uses the call information recorded in the given
560 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
561 /// as needed. Called by updateStackNodes which sets up the given
562 /// StackIdToMatchingCalls map.
563 void assignStackNodesPostOrder(
564 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
565 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
566 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
567
568 /// Duplicates the given set of context ids, updating the provided
569 /// map from each original id with the newly generated context ids,
570 /// and returning the new duplicated id set.
571 DenseSet<uint32_t> duplicateContextIds(
572 const DenseSet<uint32_t> &StackSequenceContextIds,
573 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
574
575 /// Propagates all duplicated context ids across the graph.
576 void propagateDuplicateContextIds(
577 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
578
579 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
580 /// else to its callers. Also updates OrigNode's edges to remove any context
581 /// ids moved to the newly created edge.
582 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
583 bool TowardsCallee,
584 DenseSet<uint32_t> RemainingContextIds);
585
586 /// Get the stack id corresponding to the given Id or Index (for IR this will
587 /// return itself, for a summary index this will return the id recorded in the
588 /// index for that stack id index value).
589 uint64_t getStackId(uint64_t IdOrIndex) const {
590 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
591 }
592
593 /// Returns true if the given call targets the callee of the given edge, or if
594 /// we were able to identify the call chain through intermediate tail calls.
595 /// In the latter case new context nodes are added to the graph for the
596 /// identified tail calls, and their synthesized nodes are added to
597 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
598 /// the updated edges and to prepare it for an increment in the caller.
599 bool
600 calleesMatch(CallTy Call, EdgeIter &EI,
601 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
602
603 // Return the callee function of the given call, or nullptr if it can't be
604 // determined
605 const FuncTy *getCalleeFunc(CallTy Call) {
606 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
607 }
608
609 /// Returns true if the given call targets the given function, or if we were
610 /// able to identify the call chain through intermediate tail calls (in which
611 /// case FoundCalleeChain will be populated).
612 bool calleeMatchesFunc(
613 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
614 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
615 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
616 Call, Func, CallerFunc, FoundCalleeChain);
617 }
618
619 /// Returns true if both call instructions have the same callee.
620 bool sameCallee(CallTy Call1, CallTy Call2) {
621 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
622 }
623
624 /// Get a list of nodes corresponding to the stack ids in the given
625 /// callsite's context.
626 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
627 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
628 Call);
629 }
630
631 /// Get the last stack id in the context for callsite.
632 uint64_t getLastStackId(CallTy Call) {
633 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
634 }
635
636 /// Update the allocation call to record type of allocated memory.
637 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
638 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
639 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
640 }
641
642 /// Get the AllocationType assigned to the given allocation instruction clone.
643 AllocationType getAllocationCallType(const CallInfo &Call) const {
644 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
645 }
646
647 /// Update non-allocation call to invoke (possibly cloned) function
648 /// CalleeFunc.
649 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
650 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
651 }
652
653 /// Clone the given function for the given callsite, recording mapping of all
654 /// of the functions tracked calls to their new versions in the CallMap.
655 /// Assigns new clones to clone number CloneNo.
656 FuncInfo cloneFunctionForCallsite(
657 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
658 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
659 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
660 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
661 }
662
663 /// Gets a label to use in the dot graph for the given call clone in the given
664 /// function.
665 std::string getLabel(const FuncTy *Func, const CallTy Call,
666 unsigned CloneNo) const {
667 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
668 }
669
670 // Create and return a new ContextNode.
671 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
672 CallInfo C = CallInfo()) {
673 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
674 auto *NewNode = NodeOwner.back().get();
675 if (F)
676 NodeToCallingFunc[NewNode] = F;
677 return NewNode;
678 }
679
680 /// Helpers to find the node corresponding to the given call or stackid.
681 ContextNode *getNodeForInst(const CallInfo &C);
682 ContextNode *getNodeForAlloc(const CallInfo &C);
683 ContextNode *getNodeForStackId(uint64_t StackId);
684
685 /// Computes the alloc type corresponding to the given context ids, by
686 /// unioning their recorded alloc types.
687 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
688
689 /// Returns the allocation type of the intersection of the contexts of two
690 /// nodes (based on their provided context id sets), optimized for the case
691 /// when Node1Ids is smaller than Node2Ids.
692 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
693 const DenseSet<uint32_t> &Node2Ids) const;
694
695 /// Returns the allocation type of the intersection of the contexts of two
696 /// nodes (based on their provided context id sets).
697 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
698 const DenseSet<uint32_t> &Node2Ids) const;
699
700 /// Create a clone of Edge's callee and move Edge to that new callee node,
701 /// performing the necessary context id and allocation type updates.
702 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
703 /// moved to an edge to the new callee.
704 ContextNode *
705 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
706 DenseSet<uint32_t> ContextIdsToMove = {});
707
708 /// Change the callee of Edge to existing callee clone NewCallee, performing
709 /// the necessary context id and allocation type updates.
710 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
711 /// moved to an edge to the new callee.
712 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
713 ContextNode *NewCallee,
714 bool NewClone = false,
715 DenseSet<uint32_t> ContextIdsToMove = {});
716
717 /// Change the caller of the edge at the given callee edge iterator to be
718 /// NewCaller, performing the necessary context id and allocation type
719 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
720 /// a simplified version of it as we always move the given edge and all of its
721 /// context ids.
722 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
723 ContextNode *NewCaller);
724
725 /// Recursively perform cloning on the graph for the given Node and its
726 /// callers, in order to uniquely identify the allocation behavior of an
727 /// allocation given its context. The context ids of the allocation being
728 /// processed are given in AllocContextIds.
729 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
730 const DenseSet<uint32_t> &AllocContextIds);
731
732 /// Map from each context ID to the AllocationType assigned to that context.
733 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
734
735 /// Map from each contextID to the profiled full contexts and their total
736 /// sizes (there may be more than one due to context trimming),
737 /// optionally populated when requested (via MemProfReportHintedSizes or
738 /// MinClonedColdBytePercent).
739 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
740
741 /// Identifies the context node created for a stack id when adding the MIB
742 /// contexts to the graph. This is used to locate the context nodes when
743 /// trying to assign the corresponding callsites with those stack ids to these
744 /// nodes.
745 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
746
747 /// Maps to track the calls to their corresponding nodes in the graph.
748 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
749 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
750
751 /// Owner of all ContextNode unique_ptrs.
752 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
753
754 /// Perform sanity checks on graph when requested.
755 void check() const;
756
757 /// Keeps track of the last unique context id assigned.
758 unsigned int LastContextId = 0;
759};
760
// Free alias templates for the nested graph types, so code outside the class
// can refer to them without spelling out the full dependent typename each time.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
773
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  // Implementations of the hooks consumed by the CRTP base class, operating
  // on IR Functions and Instructions. See the base CallsiteContextGraph for
  // the semantic contract of each.
  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  // The module whose IR this graph was built from.
  const Module &Mod;
};
814
/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  // Provide operator-> so an IndexCall can be used with pointer-like syntax
  // in code shared with the IR (Instruction *) graph flavor.
  IndexCall *operator->() { return this; }

  /// Print whichever summary structure (AllocInfo or CallsiteInfo) this call
  /// wraps; exactly one must be present.
  void print(raw_ostream &OS) const {
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
      assert(CI);
      OS << *CI;
    }
  }
};
838} // namespace
839
namespace llvm {
// Teach LLVM's casting machinery (dyn_cast_if_present etc.) to look through
// IndexCall to the underlying PointerUnion value.
template <> struct simplify_type<IndexCall> {
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm
850
851namespace {
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  // Implementations of the hooks consumed by the CRTP base class, operating
  // on FunctionSummary and IndexCall. See the base CallsiteContextGraph for
  // the semantic contract of each.
  uint64_t getStackId(uint64_t IdOrIndex) const;
  const FunctionSummary *getCalleeFunc(IndexCall &Call);
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool sameCallee(IndexCall &Call1, IndexCall &Call2);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves mapping from function summaries containing memprof records back to
  // its VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
919} // namespace
920
namespace llvm {
// DenseMap key traits for the two graph flavors' CallInfo types and for
// IndexCall itself, each delegating to the traits of the underlying
// representation.
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm
934
935namespace {
936
937// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
938// type we should actually use on the corresponding allocation.
939// If we can't clone a node that has NotCold+Cold alloc type, we will fall
940// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
941// from NotCold.
942AllocationType allocTypeToUse(uint8_t AllocTypes) {
943 assert(AllocTypes != (uint8_t)AllocationType::None);
944 if (AllocTypes ==
945 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
946 return AllocationType::NotCold;
947 else
948 return (AllocationType)AllocTypes;
949}
950
951// Helper to check if the alloc types for all edges recorded in the
952// InAllocTypes vector match the alloc types for all edges in the Edges
953// vector.
954template <typename DerivedCCG, typename FuncTy, typename CallTy>
955bool allocTypesMatch(
956 const std::vector<uint8_t> &InAllocTypes,
957 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
958 &Edges) {
959 // This should be called only when the InAllocTypes vector was computed for
960 // this set of Edges. Make sure the sizes are the same.
961 assert(InAllocTypes.size() == Edges.size());
962 return std::equal(
963 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
964 [](const uint8_t &l,
965 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
966 // Can share if one of the edges is None type - don't
967 // care about the type along that edge as it doesn't
968 // exist for those context ids.
969 if (l == (uint8_t)AllocationType::None ||
970 r->AllocTypes == (uint8_t)AllocationType::None)
971 return true;
972 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
973 });
974}
975
// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for callee edges in the given
// clone. Because the InAllocTypes were computed from the original node's callee
// edges, and other cloning could have happened after this clone was created, we
// need to find the matching clone callee edge, which may or may not exist.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatchClone(
    const std::vector<uint8_t> &InAllocTypes,
    const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
  // Clone must actually be a clone of some original node.
  const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
  assert(Node);
  // InAllocTypes should have been computed for the original node's callee
  // edges.
  assert(InAllocTypes.size() == Node->CalleeEdges.size());
  // First create a map of the clone callee edge callees to the edge alloc type.
      EdgeCalleeMap;
  for (const auto &E : Clone->CalleeEdges) {
    // Duplicate edges to the same callee are not expected (see checkNode).
    assert(!EdgeCalleeMap.contains(E->Callee));
    EdgeCalleeMap[E->Callee] = E->AllocTypes;
  }
  // Next, walk the original node's callees, and look for the corresponding
  // clone edge to that callee.
  for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
    auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
    // Not found is ok, we will simply add an edge if we use this clone.
    if (Iter == EdgeCalleeMap.end())
      continue;
    // Can share if one of the edges is None type - don't
    // care about the type along that edge as it doesn't
    // exist for those context ids.
    if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
        Iter->second == (uint8_t)AllocationType::None)
      continue;
    if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
      return false;
  }
  return true;
}
1015
1016} // end anonymous namespace
1017
1018template <typename DerivedCCG, typename FuncTy, typename CallTy>
1019typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1020CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1021 const CallInfo &C) {
1022 ContextNode *Node = getNodeForAlloc(C);
1023 if (Node)
1024 return Node;
1025
1026 return NonAllocationCallToContextNodeMap.lookup(C);
1027}
1028
1029template <typename DerivedCCG, typename FuncTy, typename CallTy>
1030typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1031CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1032 const CallInfo &C) {
1033 return AllocationCallToContextNodeMap.lookup(C);
1034}
1035
1036template <typename DerivedCCG, typename FuncTy, typename CallTy>
1037typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1038CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1039 uint64_t StackId) {
1040 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1041 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1042 return StackEntryNode->second;
1043 return nullptr;
1044}
1045
1046template <typename DerivedCCG, typename FuncTy, typename CallTy>
1047void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1048 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1049 unsigned int ContextId) {
1050 for (auto &Edge : CallerEdges) {
1051 if (Edge->Caller == Caller) {
1052 Edge->AllocTypes |= (uint8_t)AllocType;
1053 Edge->getContextIds().insert(ContextId);
1054 return;
1055 }
1056 }
1057 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1058 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1059 CallerEdges.push_back(Edge);
1060 Caller->CalleeEdges.push_back(Edge);
1061}
1062
/// Remove Edge from the graph, unhooking it from both its caller and callee
/// nodes. If EI is non-null it must reference Edge within the caller's
/// CalleeEdges (CalleeIter == true) or the callee's CallerEdges
/// (CalleeIter == false), and is advanced past the erased entry so callers
/// can continue iterating while removing edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  // Record edge list sizes so the asserts below can confirm that each
  // endpoint actually lost an edge.
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
1097
1098template <typename DerivedCCG, typename FuncTy, typename CallTy>
1099void CallsiteContextGraph<
1100 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1101 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1102 auto Edge = *EI;
1103 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1104 assert(Edge->ContextIds.empty());
1105 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1106 } else
1107 ++EI;
1108 }
1109}
1110
1111template <typename DerivedCCG, typename FuncTy, typename CallTy>
1112void CallsiteContextGraph<
1113 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1114 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1115 auto Edge = *EI;
1116 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1117 assert(Edge->ContextIds.empty());
1118 Edge->Caller->eraseCalleeEdge(Edge.get());
1119 EI = Node->CallerEdges.erase(EI);
1120 } else
1121 ++EI;
1122 }
1123}
1124
1125template <typename DerivedCCG, typename FuncTy, typename CallTy>
1126typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1127CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1128 findEdgeFromCallee(const ContextNode *Callee) {
1129 for (const auto &Edge : CalleeEdges)
1130 if (Edge->Callee == Callee)
1131 return Edge.get();
1132 return nullptr;
1133}
1134
1135template <typename DerivedCCG, typename FuncTy, typename CallTy>
1136typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1137CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1138 findEdgeFromCaller(const ContextNode *Caller) {
1139 for (const auto &Edge : CallerEdges)
1140 if (Edge->Caller == Caller)
1141 return Edge.get();
1142 return nullptr;
1143}
1144
1145template <typename DerivedCCG, typename FuncTy, typename CallTy>
1146void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1147 eraseCalleeEdge(const ContextEdge *Edge) {
1148 auto EI = llvm::find_if(
1149 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1150 return CalleeEdge.get() == Edge;
1151 });
1152 assert(EI != CalleeEdges.end());
1153 CalleeEdges.erase(EI);
1154}
1155
1156template <typename DerivedCCG, typename FuncTy, typename CallTy>
1157void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1158 eraseCallerEdge(const ContextEdge *Edge) {
1159 auto EI = llvm::find_if(
1160 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1161 return CallerEdge.get() == Edge;
1162 });
1163 assert(EI != CallerEdges.end());
1164 CallerEdges.erase(EI);
1165}
1166
1167template <typename DerivedCCG, typename FuncTy, typename CallTy>
1168uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1169 DenseSet<uint32_t> &ContextIds) const {
1170 uint8_t BothTypes =
1171 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1172 uint8_t AllocType = (uint8_t)AllocationType::None;
1173 for (auto Id : ContextIds) {
1174 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1175 // Bail early if alloc type reached both, no further refinement.
1176 if (AllocType == BothTypes)
1177 return AllocType;
1178 }
1179 return AllocType;
1180}
1181
1182template <typename DerivedCCG, typename FuncTy, typename CallTy>
1183uint8_t
1184CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1185 const DenseSet<uint32_t> &Node1Ids,
1186 const DenseSet<uint32_t> &Node2Ids) const {
1187 uint8_t BothTypes =
1188 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1189 uint8_t AllocType = (uint8_t)AllocationType::None;
1190 for (auto Id : Node1Ids) {
1191 if (!Node2Ids.count(Id))
1192 continue;
1193 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1194 // Bail early if alloc type reached both, no further refinement.
1195 if (AllocType == BothTypes)
1196 return AllocType;
1197 }
1198 return AllocType;
1199}
1200
1201template <typename DerivedCCG, typename FuncTy, typename CallTy>
1202uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1203 const DenseSet<uint32_t> &Node1Ids,
1204 const DenseSet<uint32_t> &Node2Ids) const {
1205 if (Node1Ids.size() < Node2Ids.size())
1206 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1207 else
1208 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1209}
1210
1211template <typename DerivedCCG, typename FuncTy, typename CallTy>
1212typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1213CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1214 CallInfo Call, const FuncTy *F) {
1215 assert(!getNodeForAlloc(Call));
1216 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1217 AllocationCallToContextNodeMap[Call] = AllocNode;
1218 // Use LastContextId as a uniq id for MIB allocation nodes.
1219 AllocNode->OrigStackOrAllocId = LastContextId;
1220 // Alloc type should be updated as we add in the MIBs. We should assert
1221 // afterwards that it is not still None.
1222 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1223
1224 return AllocNode;
1225}
1226
1227static std::string getAllocTypeString(uint8_t AllocTypes) {
1228 if (!AllocTypes)
1229 return "None";
1230 std::string Str;
1231 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1232 Str += "NotCold";
1233 if (AllocTypes & (uint8_t)AllocationType::Cold)
1234 Str += "Cold";
1235 return Str;
1236}
1237
/// Update the graph for one MIB (memory info block) of an allocation:
/// assigns a fresh context id (incrementing LastContextId), records its
/// allocation type and optional context size info, and adds or updates a
/// chain of stack nodes -- with caller edges carrying the new context id --
/// for each frame of the MIB's calling context.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    ArrayRef<ContextTotalSize> ContextSizeInfo) {
  // Treating the hot alloc type as NotCold before the disambiguation for "hot"
  // is done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  ContextIdToAllocationType[++LastContextId] = AllocType;

  if (!ContextSizeInfo.empty()) {
    auto &Entry = ContextIdToContextSizeInfos[LastContextId];
    Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
  }

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      // First time we see this stack id: create its node and register it.
      StackNode = createNewNode(/*IsAllocation=*/false);
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    // Marking a node recursive will prevent its cloning completely, even for
    // non-recursive contexts flowing through it.
    auto Ins = StackIdSet.insert(StackId);
    if (!Ins.second)
      StackNode->Recursive = true;
    }
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}
1289
/// For each context id in StackSequenceContextIds, allocate a brand new
/// context id carrying the same allocation type, recording the old->new
/// mapping in OldToNewContextIds (one old id may accumulate several new ids
/// across calls). Returns the set of newly created ids.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> NewContextIds;
  for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
  }
  return NewContextIds;
}
1305
/// After ids have been duplicated (see duplicateContextIds), walk caller
/// edges up from every allocation node and insert the new ids alongside
/// their originals on every edge where an original id appears.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
  auto UpdateCallers = [&](ContextNode *Node,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      // Visit each edge only once across the whole traversal.
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}
1345
/// Move the given subset of context ids from OrigNode's edges onto NewNode:
/// for each original edge carrying any of RemainingContextIds, create a
/// corresponding edge on NewNode (towards its callees when TowardsCallee is
/// true, otherwise towards its callers), and remove original edges left with
/// no context ids.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
    // This must be passed by value to make a copy since it will be adjusted
    // as ids are moved.
    DenseSet<uint32_t> RemainingContextIds) {
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  DenseSet<uint32_t> RecursiveContextIds;
  DenseSet<uint32_t> AllCallerContextIds;
    // Identify which context ids are recursive which is needed to properly
    // update the RemainingContextIds set. The relevant recursive context ids
    // are those that are in multiple edges.
    for (auto &CE : OrigEdges) {
      AllCallerContextIds.reserve(CE->getContextIds().size());
      for (auto Id : CE->getContextIds())
        if (!AllCallerContextIds.insert(Id).second)
          RecursiveContextIds.insert(Id);
    }
  }
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    DenseSet<uint32_t> NewEdgeContextIds;
    DenseSet<uint32_t> NotFoundContextIds;
    // Remove any matching context ids from Edge, return set that were found and
    // removed, these are the new edge's context ids. Also update the remaining
    // (not found ids).
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    // Update the remaining context ids set for the later edges. This is a
    // compile time optimization.
    if (RecursiveContextIds.empty()) {
      // No recursive ids, so all of the previously remaining context ids that
      // were not seen on this edge are the new remaining set.
      RemainingContextIds.swap(NotFoundContextIds);
    } else {
      // Keep the recursive ids in the remaining set as we expect to see those
      // on another edge. We can remove the non-recursive remaining ids that
      // were seen on this edge, however. We already have the set of remaining
      // ids that were on this edge (in NewEdgeContextIds). Figure out which are
      // non-recursive and only remove those. Note that despite the higher
      // overhead of updating the remaining context ids set when recursion
      // handling is enabled, it was found to be at worst performance neutral
      // and in one case a clear win.
      DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
          set_difference(NewEdgeContextIds, RecursiveContextIds);
      set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
    }
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    if (TowardsCallee) {
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
      continue;
    }
    ++EI;
  }
}
1422
/// Assertion helper: verifies the basic invariants of a live context edge --
/// a non-None alloc type and a non-empty context id set.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
  // Confirm that alloc type is not None and that we have at least one context
  // id.
  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
  assert(!Edge->ContextIds.empty());
}
1431
/// Assertion helper: verifies a node's invariants -- optionally each live
/// edge via checkEdge, that the node's context ids cover the union of its
/// caller and callee edge context ids, and that there are no duplicate
/// callee edges to the same node.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    DenseSet<uint32_t> CallerEdgeContextIds(
        Node->CallerEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer. If we are allowing recursive callsites and
    // contexts this will be violated for incompletely cloned recursive cycles,
    // so skip the checking in that case.
           NodeContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, NodeContextIds));
  }
  if (Node->CalleeEdges.size()) {
    DenseSet<uint32_t> CalleeEdgeContextIds(
        Node->CalleeEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->getContextIds());
    }
    // If we are allowing recursive callsites and contexts this will be violated
    // for incompletely cloned recursive cycles, so skip the checking in that
    // case.
           NodeContextIds == CalleeEdgeContextIds);
  }
  // FIXME: Since this checking is only invoked under an option, we should
  // change the error checking from using assert to something that will trigger
  // an error on a release build.
#ifndef NDEBUG
  // Make sure we don't end up with duplicate edges between the same caller and
  // callee.
  for (const auto &E : Node->CalleeEdges)
    NodeSet.insert(E->Callee);
  assert(NodeSet.size() == Node->CalleeEdges.size());
#endif
}
1485
1486template <typename DerivedCCG, typename FuncTy, typename CallTy>
1487void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1488 assignStackNodesPostOrder(
1489 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1490 DenseMap<uint64_t, std::vector<CallContextInfo>>
1491 &StackIdToMatchingCalls,
1492 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1493 auto Inserted = Visited.insert(Node);
1494 if (!Inserted.second)
1495 return;
1496 // Post order traversal. Iterate over a copy since we may add nodes and
1497 // therefore new callers during the recursive call, invalidating any
1498 // iterator over the original edge vector. We don't need to process these
1499 // new nodes as they were already processed on creation.
1500 auto CallerEdges = Node->CallerEdges;
1501 for (auto &Edge : CallerEdges) {
1502 // Skip any that have been removed during the recursion.
1503 if (Edge->isRemoved()) {
1504 assert(!is_contained(Node->CallerEdges, Edge));
1505 continue;
1506 }
1507 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1508 CallToMatchingCall);
1509 }
1510
1511 // If this node's stack id is in the map, update the graph to contain new
1512 // nodes representing any inlining at interior callsites. Note we move the
1513 // associated context ids over to the new nodes.
1514
1515 // Ignore this node if it is for an allocation or we didn't record any
1516 // stack id lists ending at it.
1517 if (Node->IsAllocation ||
1518 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1519 return;
1520
1521 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1522 // Handle the simple case first. A single call with a single stack id.
1523 // In this case there is no need to create any new context nodes, simply
1524 // assign the context node for stack id to this Call.
1525 if (Calls.size() == 1) {
1526 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1527 if (Ids.size() == 1) {
1528 assert(SavedContextIds.empty());
1529 // It should be this Node
1530 assert(Node == getNodeForStackId(Ids[0]));
1531 if (Node->Recursive)
1532 return;
1533 Node->setCall(Call);
1534 NonAllocationCallToContextNodeMap[Call] = Node;
1535 NodeToCallingFunc[Node] = Func;
1536 return;
1537 }
1538 }
1539
1540#ifndef NDEBUG
1541 // Find the node for the last stack id, which should be the same
1542 // across all calls recorded for this id, and is this node's id.
1543 uint64_t LastId = Node->OrigStackOrAllocId;
1544 ContextNode *LastNode = getNodeForStackId(LastId);
1545 // We should only have kept stack ids that had nodes.
1546 assert(LastNode);
1547 assert(LastNode == Node);
1548#else
1549 ContextNode *LastNode = Node;
1550#endif
1551
1552 // Compute the last node's context ids once, as it is shared by all calls in
1553 // this entry.
1554 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1555
1556 bool PrevIterCreatedNode = false;
1557 bool CreatedNode = false;
1558 for (unsigned I = 0; I < Calls.size();
1559 I++, PrevIterCreatedNode = CreatedNode) {
1560 CreatedNode = false;
1561 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1562 // Skip any for which we didn't assign any ids, these don't get a node in
1563 // the graph.
1564 if (SavedContextIds.empty()) {
1565 // If this call has a matching call (located in the same function and
1566 // having the same stack ids), simply add it to the context node created
1567 // for its matching call earlier. These can be treated the same through
1568 // cloning and get updated at the same time.
1569 if (!CallToMatchingCall.contains(Call))
1570 continue;
1571 auto MatchingCall = CallToMatchingCall[Call];
1572 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1573 // This should only happen if we had a prior iteration, and it didn't
1574 // create a node because of the below recomputation of context ids
1575 // finding none remaining and continuing early.
1576 assert(I > 0 && !PrevIterCreatedNode);
1577 continue;
1578 }
1579 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1580 Call);
1581 continue;
1582 }
1583
1584 assert(LastId == Ids.back());
1585
1586 // Recompute the context ids for this stack id sequence (the
1587 // intersection of the context ids of the corresponding nodes).
1588 // Start with the ids we saved in the map for this call, which could be
1589 // duplicated context ids. We have to recompute as we might have overlap
1590 // overlap between the saved context ids for different last nodes, and
1591 // removed them already during the post order traversal.
1592 set_intersect(SavedContextIds, LastNodeContextIds);
1593 ContextNode *PrevNode = LastNode;
1594 bool Skip = false;
1595 // Iterate backwards through the stack Ids, starting after the last Id
1596 // in the list, which was handled once outside for all Calls.
1597 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1598 auto Id = *IdIter;
1599 ContextNode *CurNode = getNodeForStackId(Id);
1600 // We should only have kept stack ids that had nodes and weren't
1601 // recursive.
1602 assert(CurNode);
1603 assert(!CurNode->Recursive);
1604
1605 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1606 if (!Edge) {
1607 Skip = true;
1608 break;
1609 }
1610 PrevNode = CurNode;
1611
1612 // Update the context ids, which is the intersection of the ids along
1613 // all edges in the sequence.
1614 set_intersect(SavedContextIds, Edge->getContextIds());
1615
1616 // If we now have no context ids for clone, skip this call.
1617 if (SavedContextIds.empty()) {
1618 Skip = true;
1619 break;
1620 }
1621 }
1622 if (Skip)
1623 continue;
1624
1625 // Create new context node.
1626 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1627 NonAllocationCallToContextNodeMap[Call] = NewNode;
1628 CreatedNode = true;
1629 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1630
1631 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1632 assert(FirstNode);
1633
1634 // Connect to callees of innermost stack frame in inlined call chain.
1635 // This updates context ids for FirstNode's callee's to reflect those
1636 // moved to NewNode.
1637 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1638
1639 // Connect to callers of outermost stack frame in inlined call chain.
1640 // This updates context ids for FirstNode's caller's to reflect those
1641 // moved to NewNode.
1642 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1643
1644 // Now we need to remove context ids from edges/nodes between First and
1645 // Last Node.
1646 PrevNode = nullptr;
1647 for (auto Id : Ids) {
1648 ContextNode *CurNode = getNodeForStackId(Id);
1649 // We should only have kept stack ids that had nodes.
1650 assert(CurNode);
1651
1652 // Remove the context ids moved to NewNode from CurNode, and the
1653 // edge from the prior node.
1654 if (PrevNode) {
1655 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1656 assert(PrevEdge);
1657 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1658 if (PrevEdge->getContextIds().empty())
1659 removeEdgeFromGraph(PrevEdge);
1660 }
1661 // Since we update the edges from leaf to tail, only look at the callee
1662 // edges. This isn't an alloc node, so if there are no callee edges, the
1663 // alloc type is None.
1664 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1665 ? (uint8_t)AllocationType::None
1666 : CurNode->computeAllocType();
1667 PrevNode = CurNode;
1668 }
1669 if (VerifyNodes) {
1670 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1671 for (auto Id : Ids) {
1672 ContextNode *CurNode = getNodeForStackId(Id);
1673 // We should only have kept stack ids that had nodes.
1674 assert(CurNode);
1675 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1676 }
1677 }
1678 }
1679}
1680
1681template <typename DerivedCCG, typename FuncTy, typename CallTy>
1682void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1683 // Map of stack id to all calls with that as the last (outermost caller)
1684 // callsite id that has a context node (some might not due to pruning
1685 // performed during matching of the allocation profile contexts).
1686 // The CallContextInfo contains the Call and a list of its stack ids with
1687 // ContextNodes, the function containing Call, and the set of context ids
1688 // the analysis will eventually identify for use in any new node created
1689 // for that callsite.
1691 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1692 for (auto &Call : CallsWithMetadata) {
1693 // Ignore allocations, already handled.
1694 if (AllocationCallToContextNodeMap.count(Call))
1695 continue;
1696 auto StackIdsWithContextNodes =
1697 getStackIdsWithContextNodesForCall(Call.call());
1698 // If there were no nodes created for MIBs on allocs (maybe this was in
1699 // the unambiguous part of the MIB stack that was pruned), ignore.
1700 if (StackIdsWithContextNodes.empty())
1701 continue;
1702 // Otherwise, record this Call along with the list of ids for the last
1703 // (outermost caller) stack id with a node.
1704 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1705 {Call.call(), StackIdsWithContextNodes, Func, {}});
1706 }
1707 }
1708
1709 // First make a pass through all stack ids that correspond to a call,
1710 // as identified in the above loop. Compute the context ids corresponding to
1711 // each of these calls when they correspond to multiple stack ids due to
1712 // due to inlining. Perform any duplication of context ids required when
1713 // there is more than one call with the same stack ids. Their (possibly newly
1714 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1715 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1716 // Save a map from each call to any that are found to match it. I.e. located
1717 // in the same function and have the same (possibly pruned) stack ids. We use
1718 // this to avoid creating extra graph nodes as they can be treated the same.
1719 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1720 for (auto &It : StackIdToMatchingCalls) {
1721 auto &Calls = It.getSecond();
1722 // Skip single calls with a single stack id. These don't need a new node.
1723 if (Calls.size() == 1) {
1724 auto &Ids = Calls[0].StackIds;
1725 if (Ids.size() == 1)
1726 continue;
1727 }
1728 // In order to do the best and maximal matching of inlined calls to context
1729 // node sequences we will sort the vectors of stack ids in descending order
1730 // of length, and within each length, lexicographically by stack id. The
1731 // latter is so that we can specially handle calls that have identical stack
1732 // id sequences (either due to cloning or artificially because of the MIB
1733 // context pruning). Those with the same Ids are then sorted by function to
1734 // facilitate efficiently mapping them to the same context node.
1735 // Because the functions are pointers, to ensure a stable sort first assign
1736 // each function pointer to its first index in the Calls array, and then use
1737 // that to sort by.
1739 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1740 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1741 std::stable_sort(
1742 Calls.begin(), Calls.end(),
1743 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1744 return A.StackIds.size() > B.StackIds.size() ||
1745 (A.StackIds.size() == B.StackIds.size() &&
1746 (A.StackIds < B.StackIds ||
1747 (A.StackIds == B.StackIds &&
1748 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1749 });
1750
1751 // Find the node for the last stack id, which should be the same
1752 // across all calls recorded for this id, and is the id for this
1753 // entry in the StackIdToMatchingCalls map.
1754 uint64_t LastId = It.getFirst();
1755 ContextNode *LastNode = getNodeForStackId(LastId);
1756 // We should only have kept stack ids that had nodes.
1757 assert(LastNode);
1758
1759 if (LastNode->Recursive)
1760 continue;
1761
1762 // Initialize the context ids with the last node's. We will subsequently
1763 // refine the context ids by computing the intersection along all edges.
1764 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1765 assert(!LastNodeContextIds.empty());
1766
1767#ifndef NDEBUG
1768 // Save the set of functions seen for a particular set of the same stack
1769 // ids. This is used to ensure that they have been correctly sorted to be
1770 // adjacent in the Calls list, since we rely on that to efficiently place
1771 // all such matching calls onto the same context node.
1772 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1773#endif
1774
1775 for (unsigned I = 0; I < Calls.size(); I++) {
1776 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1777 assert(SavedContextIds.empty());
1778 assert(LastId == Ids.back());
1779
1780#ifndef NDEBUG
1781 // If this call has a different set of ids than the last one, clear the
1782 // set used to ensure they are sorted properly.
1783 if (I > 0 && Ids != Calls[I - 1].StackIds)
1784 MatchingIdsFuncSet.clear();
1785#endif
1786
1787 // First compute the context ids for this stack id sequence (the
1788 // intersection of the context ids of the corresponding nodes).
1789 // Start with the remaining saved ids for the last node.
1790 assert(!LastNodeContextIds.empty());
1791 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1792
1793 ContextNode *PrevNode = LastNode;
1794 ContextNode *CurNode = LastNode;
1795 bool Skip = false;
1796
1797 // Iterate backwards through the stack Ids, starting after the last Id
1798 // in the list, which was handled once outside for all Calls.
1799 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1800 auto Id = *IdIter;
1801 CurNode = getNodeForStackId(Id);
1802 // We should only have kept stack ids that had nodes.
1803 assert(CurNode);
1804
1805 if (CurNode->Recursive) {
1806 Skip = true;
1807 break;
1808 }
1809
1810 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1811 // If there is no edge then the nodes belong to different MIB contexts,
1812 // and we should skip this inlined context sequence. For example, this
1813 // particular inlined context may include stack ids A->B, and we may
1814 // indeed have nodes for both A and B, but it is possible that they were
1815 // never profiled in sequence in a single MIB for any allocation (i.e.
1816 // we might have profiled an allocation that involves the callsite A,
1817 // but through a different one of its callee callsites, and we might
1818 // have profiled an allocation that involves callsite B, but reached
1819 // from a different caller callsite).
1820 if (!Edge) {
1821 Skip = true;
1822 break;
1823 }
1824 PrevNode = CurNode;
1825
1826 // Update the context ids, which is the intersection of the ids along
1827 // all edges in the sequence.
1828 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1829
1830 // If we now have no context ids for clone, skip this call.
1831 if (StackSequenceContextIds.empty()) {
1832 Skip = true;
1833 break;
1834 }
1835 }
1836 if (Skip)
1837 continue;
1838
1839 // If some of this call's stack ids did not have corresponding nodes (due
1840 // to pruning), don't include any context ids for contexts that extend
1841 // beyond these nodes. Otherwise we would be matching part of unrelated /
1842 // not fully matching stack contexts. To do this, subtract any context ids
1843 // found in caller nodes of the last node found above.
1844 if (Ids.back() != getLastStackId(Call)) {
1845 for (const auto &PE : LastNode->CallerEdges) {
1846 set_subtract(StackSequenceContextIds, PE->getContextIds());
1847 if (StackSequenceContextIds.empty())
1848 break;
1849 }
1850 // If we now have no context ids for clone, skip this call.
1851 if (StackSequenceContextIds.empty())
1852 continue;
1853 }
1854
1855#ifndef NDEBUG
1856 // If the prior call had the same stack ids this set would not be empty.
1857 // Check if we already have a call that "matches" because it is located
1858 // in the same function. If the Calls list was sorted properly we should
1859 // not encounter this situation as all such entries should be adjacent
1860 // and processed in bulk further below.
1861 assert(!MatchingIdsFuncSet.contains(Func));
1862
1863 MatchingIdsFuncSet.insert(Func);
1864#endif
1865
1866 // Check if the next set of stack ids is the same (since the Calls vector
1867 // of tuples is sorted by the stack ids we can just look at the next one).
1868 // If so, save them in the CallToMatchingCall map so that they get
1869 // assigned to the same context node, and skip them.
1870 bool DuplicateContextIds = false;
1871 for (unsigned J = I + 1; J < Calls.size(); J++) {
1872 auto &CallCtxInfo = Calls[J];
1873 auto &NextIds = CallCtxInfo.StackIds;
1874 if (NextIds != Ids)
1875 break;
1876 auto *NextFunc = CallCtxInfo.Func;
1877 if (NextFunc != Func) {
1878 // We have another Call with the same ids but that cannot share this
1879 // node, must duplicate ids for it.
1880 DuplicateContextIds = true;
1881 break;
1882 }
1883 auto &NextCall = CallCtxInfo.Call;
1884 CallToMatchingCall[NextCall] = Call;
1885 // Update I so that it gets incremented correctly to skip this call.
1886 I = J;
1887 }
1888
1889 // If we don't have duplicate context ids, then we can assign all the
1890 // context ids computed for the original node sequence to this call.
1891 // If there are duplicate calls with the same stack ids then we synthesize
1892 // new context ids that are duplicates of the originals. These are
1893 // assigned to SavedContextIds, which is a reference into the map entry
1894 // for this call, allowing us to access these ids later on.
1895 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1896 StackSequenceContextIds.size());
1897 SavedContextIds =
1898 DuplicateContextIds
1899 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1900 : StackSequenceContextIds;
1901 assert(!SavedContextIds.empty());
1902
1903 if (!DuplicateContextIds) {
1904 // Update saved last node's context ids to remove those that are
1905 // assigned to other calls, so that it is ready for the next call at
1906 // this stack id.
1907 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1908 if (LastNodeContextIds.empty())
1909 break;
1910 }
1911 }
1912 }
1913
1914 // Propagate the duplicate context ids over the graph.
1915 propagateDuplicateContextIds(OldToNewContextIds);
1916
1917 if (VerifyCCG)
1918 check();
1919
1920 // Now perform a post-order traversal over the graph, starting with the
1921 // allocation nodes, essentially processing nodes from callers to callees.
1922 // For any that contains an id in the map, update the graph to contain new
1923 // nodes representing any inlining at interior callsites. Note we move the
1924 // associated context ids over to the new nodes.
1926 for (auto &Entry : AllocationCallToContextNodeMap)
1927 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1928 CallToMatchingCall);
1929 if (VerifyCCG)
1930 check();
1931}
1932
1933uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1935 Call->getMetadata(LLVMContext::MD_callsite));
1936 return CallsiteContext.back();
1937}
1938
1939uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1940 assert(isa<CallsiteInfo *>(Call));
1942 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1943 // Need to convert index into stack id.
1944 return Index.getStackIdAtIndex(CallsiteContext.back());
1945}
1946
// Suffix appended (together with the clone number) to the names of function
// clones produced by this pass. Kept as a plain C string constant rather than
// a std::string so no static constructor is required (per the LLVM coding
// standard); all uses (Twine concatenation, StringRef::contains) accept it.
static const char *const MemProfCloneSuffix = ".memprof.";
1948
1949static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1950 // We use CloneNo == 0 to refer to the original version, which doesn't get
1951 // renamed with a suffix.
1952 if (!CloneNo)
1953 return Base.str();
1954 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1955}
1956
1957static bool isMemProfClone(const Function &F) {
1958 return F.getName().contains(MemProfCloneSuffix);
1959}
1960
1961std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1962 const Instruction *Call,
1963 unsigned CloneNo) const {
1964 return (Twine(Call->getFunction()->getName()) + " -> " +
1965 cast<CallBase>(Call)->getCalledFunction()->getName())
1966 .str();
1967}
1968
1969std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1970 const IndexCall &Call,
1971 unsigned CloneNo) const {
1972 auto VI = FSToVIMap.find(Func);
1973 assert(VI != FSToVIMap.end());
1974 if (isa<AllocInfo *>(Call))
1975 return (VI->second.name() + " -> alloc").str();
1976 else {
1977 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
1978 return (VI->second.name() + " -> " +
1979 getMemProfFuncName(Callsite->Callee.name(),
1980 Callsite->Clones[CloneNo]))
1981 .str();
1982 }
1983}
1984
1985std::vector<uint64_t>
1986ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1987 Instruction *Call) {
1989 Call->getMetadata(LLVMContext::MD_callsite));
1990 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1991 CallsiteContext);
1992}
1993
1994std::vector<uint64_t>
1995IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1996 assert(isa<CallsiteInfo *>(Call));
1998 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1999 return getStackIdsWithContextNodes<CallsiteInfo,
2001 CallsiteContext);
2002}
2003
2004template <typename DerivedCCG, typename FuncTy, typename CallTy>
2005template <class NodeT, class IteratorT>
2006std::vector<uint64_t>
2007CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2008 CallStack<NodeT, IteratorT> &CallsiteContext) {
2009 std::vector<uint64_t> StackIds;
2010 for (auto IdOrIndex : CallsiteContext) {
2011 auto StackId = getStackId(IdOrIndex);
2012 ContextNode *Node = getNodeForStackId(StackId);
2013 if (!Node)
2014 break;
2015 StackIds.push_back(StackId);
2016 }
2017 return StackIds;
2018}
2019
2020ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2021 Module &M,
2023 : Mod(M), OREGetter(OREGetter) {
2024 for (auto &F : M) {
2025 std::vector<CallInfo> CallsWithMetadata;
2026 for (auto &BB : F) {
2027 for (auto &I : BB) {
2028 if (!isa<CallBase>(I))
2029 continue;
2030 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2031 CallsWithMetadata.push_back(&I);
2032 auto *AllocNode = addAllocNode(&I, &F);
2033 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2034 assert(CallsiteMD);
2035 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2036 // Add all of the MIBs and their stack nodes.
2037 for (auto &MDOp : MemProfMD->operands()) {
2038 auto *MIBMD = cast<const MDNode>(MDOp);
2039 std::vector<ContextTotalSize> ContextSizeInfo;
2040 // Collect the context size information if it exists.
2041 if (MIBMD->getNumOperands() > 2) {
2042 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2043 MDNode *ContextSizePair =
2044 dyn_cast<MDNode>(MIBMD->getOperand(I));
2045 assert(ContextSizePair->getNumOperands() == 2);
2046 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2047 ContextSizePair->getOperand(0))
2048 ->getZExtValue();
2049 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2050 ContextSizePair->getOperand(1))
2051 ->getZExtValue();
2052 ContextSizeInfo.push_back({FullStackId, TotalSize});
2053 }
2054 }
2058 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2059 AllocNode, StackContext, CallsiteContext,
2060 getMIBAllocType(MIBMD), ContextSizeInfo);
2061 }
2062 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2063 // Memprof and callsite metadata on memory allocations no longer
2064 // needed.
2065 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2066 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2067 }
2068 // For callsite metadata, add to list for this function for later use.
2069 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2070 CallsWithMetadata.push_back(&I);
2071 }
2072 }
2073 }
2074 if (!CallsWithMetadata.empty())
2075 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2076 }
2077
2078 if (DumpCCG) {
2079 dbgs() << "CCG before updating call stack chains:\n";
2080 dbgs() << *this;
2081 }
2082
2083 if (ExportToDot)
2084 exportToDot("prestackupdate");
2085
2086 updateStackNodes();
2087
2088 handleCallsitesWithMultipleTargets();
2089
2090 // Strip off remaining callsite metadata, no longer needed.
2091 for (auto &FuncEntry : FuncToCallsWithMetadata)
2092 for (auto &Call : FuncEntry.second)
2093 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2094}
2095
2096IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2097 ModuleSummaryIndex &Index,
2099 isPrevailing)
2100 : Index(Index), isPrevailing(isPrevailing) {
2101 for (auto &I : Index) {
2102 auto VI = Index.getValueInfo(I);
2103 for (auto &S : VI.getSummaryList()) {
2104 // We should only add the prevailing nodes. Otherwise we may try to clone
2105 // in a weak copy that won't be linked (and may be different than the
2106 // prevailing version).
2107 // We only keep the memprof summary on the prevailing copy now when
2108 // building the combined index, as a space optimization, however don't
2109 // rely on this optimization. The linker doesn't resolve local linkage
2110 // values so don't check whether those are prevailing.
2111 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2112 !isPrevailing(VI.getGUID(), S.get()))
2113 continue;
2114 auto *FS = dyn_cast<FunctionSummary>(S.get());
2115 if (!FS)
2116 continue;
2117 std::vector<CallInfo> CallsWithMetadata;
2118 if (!FS->allocs().empty()) {
2119 for (auto &AN : FS->mutableAllocs()) {
2120 // This can happen because of recursion elimination handling that
2121 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2122 // We still added them to the summary because we need to be able to
2123 // correlate properly in applyImport in the backends.
2124 if (AN.MIBs.empty())
2125 continue;
2126 IndexCall AllocCall(&AN);
2127 CallsWithMetadata.push_back(AllocCall);
2128 auto *AllocNode = addAllocNode(AllocCall, FS);
2129 // Pass an empty CallStack to the CallsiteContext (second)
2130 // parameter, since for ThinLTO we already collapsed out the inlined
2131 // stack ids on the allocation call during ModuleSummaryAnalysis.
2133 EmptyContext;
2134 unsigned I = 0;
2135 assert(
2137 AN.ContextSizeInfos.size() == AN.MIBs.size());
2138 // Now add all of the MIBs and their stack nodes.
2139 for (auto &MIB : AN.MIBs) {
2141 StackContext(&MIB);
2142 std::vector<ContextTotalSize> ContextSizeInfo;
2143 if (!AN.ContextSizeInfos.empty()) {
2144 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2145 ContextSizeInfo.push_back({FullStackId, TotalSize});
2146 }
2147 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2148 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2149 ContextSizeInfo);
2150 I++;
2151 }
2152 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2153 // Initialize version 0 on the summary alloc node to the current alloc
2154 // type, unless it has both types in which case make it default, so
2155 // that in the case where we aren't able to clone the original version
2156 // always ends up with the default allocation behavior.
2157 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2158 }
2159 }
2160 // For callsite metadata, add to list for this function for later use.
2161 if (!FS->callsites().empty())
2162 for (auto &SN : FS->mutableCallsites()) {
2163 IndexCall StackNodeCall(&SN);
2164 CallsWithMetadata.push_back(StackNodeCall);
2165 }
2166
2167 if (!CallsWithMetadata.empty())
2168 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2169
2170 if (!FS->allocs().empty() || !FS->callsites().empty())
2171 FSToVIMap[FS] = VI;
2172 }
2173 }
2174
2175 if (DumpCCG) {
2176 dbgs() << "CCG before updating call stack chains:\n";
2177 dbgs() << *this;
2178 }
2179
2180 if (ExportToDot)
2181 exportToDot("prestackupdate");
2182
2183 updateStackNodes();
2184
2185 handleCallsitesWithMultipleTargets();
2186}
2187
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
                          CallTy>::handleCallsitesWithMultipleTargets() {
  // Look for and workaround callsites that call multiple functions.
  // This can happen for indirect calls, which needs better handling, and in
  // more rare cases (e.g. macro expansion).
  // TODO: To fix this for indirect calls we will want to perform speculative
  // devirtualization using either the normal PGO info with ICP, or using the
  // information in the profiled MemProf contexts. We can do this prior to
  // this transformation for regular LTO, and for ThinLTO we can simulate that
  // effect in the summary and perform the actual speculative devirtualization
  // while cloning in the ThinLTO backend.

  // Keep track of the new nodes synthesized for discovered tail calls missing
  // from the profiled contexts.
  MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;

  // Calls whose node's primary call gets updated below; entries are added to
  // NonAllocationCallToContextNodeMap after the loop to avoid invalidating
  // the iteration.
  std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
  for (auto &Entry : NonAllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    assert(Node->Clones.empty());
    // Check all node callees and see if in the same function.
    // We need to check all of the calls recorded in this Node, because in some
    // cases we may have had multiple calls with the same debug info calling
    // different callees. This can happen, for example, when an object is
    // constructed in the parameter list - the destructor call of the object has
    // the same debug info (line/col) as the call the object was passed to.
    // Here we will prune any that don't match all callee nodes.
    std::vector<CallInfo> AllCalls;
    AllCalls.reserve(Node->MatchingCalls.size() + 1);
    AllCalls.push_back(Node->Call);
    AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
                    Node->MatchingCalls.end());

    // First see if we can partition the calls by callee function, creating new
    // nodes to host each set of calls calling the same callees. This is
    // necessary for supporting indirect calls with ThinLTO, for which we
    // synthesized CallsiteInfo records for each target. They will all have the
    // same callsite stack ids and would be sharing a context node at this
    // point. We need to perform separate cloning for each, which will be
    // applied along with speculative devirtualization in the ThinLTO backends
    // as needed. Note this does not currently support looking through tail
    // calls, it is unclear if we need that for indirect call targets.
    // First partition calls by callee func. Map indexed by func, value is
    // struct with list of matching calls, assigned node.
    if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
      continue;

    auto It = AllCalls.begin();
    // Iterate through the calls until we find the first that matches.
    for (; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      bool Match = true;
      for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
           ++EI) {
        auto Edge = *EI;
        if (!Edge->Callee->hasCall())
          continue;
        assert(NodeToCallingFunc.count(Edge->Callee));
        // Check if the called function matches that of the callee node.
        if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
          Match = false;
          break;
        }
      }
      // Found a call that matches the callee nodes, we can quit now.
      if (Match) {
        // If the first match is not the primary call on the Node, update it
        // now. We will update the list of matching calls further below.
        if (Node->Call != ThisCall) {
          Node->setCall(ThisCall);
          // We need to update the NonAllocationCallToContextNodeMap, but don't
          // want to do this during iteration over that map, so save the calls
          // that need updated entries.
          NewCallToNode.push_back({ThisCall, Node});
        }
        break;
      }
    }
    // We will update this list below (or leave it cleared if there was no
    // match found above).
    Node->MatchingCalls.clear();
    // If we hit the end of the AllCalls vector, no call matching the callee
    // nodes was found, clear the call information in the node.
    if (It == AllCalls.end()) {
      RemovedEdgesWithMismatchedCallees++;
      // Work around by setting Node to have a null call, so it gets
      // skipped during cloning. Otherwise assignFunctions will assert
      // because its data structures are not designed to handle this case.
      Node->setCall(CallInfo());
      continue;
    }
    // Now add back any matching calls that call the same function as the
    // matching primary call on Node.
    for (++It; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      if (!sameCallee(Node->Call.call(), ThisCall.call()))
        continue;
      Node->MatchingCalls.push_back(ThisCall);
    }
  }

  // Remove all mismatched nodes identified in the above loop from the node map
  // (checking whether they have a null call which is set above). For a
  // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
  // to do the removal via remove_if than by individually erasing entries above.
  // Also remove any entries if we updated the node's primary call above.
  NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
    return !it.second->hasCall() || it.second->Call != it.first;
  });

  // Add entries for any new primary calls recorded above.
  for (auto &[Call, Node] : NewCallToNode)
    NonAllocationCallToContextNodeMap[Call] = Node;

  // Add the new nodes after the above loop so that the iteration is not
  // invalidated.
  for (auto &[Call, Node] : TailCallToContextNodeMap)
    NonAllocationCallToContextNodeMap[Call] = Node;
}
2308
2309template <typename DerivedCCG, typename FuncTy, typename CallTy>
2310bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2311 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2312 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2313 // Struct to keep track of all the calls having the same callee function,
2314 // and the node we eventually assign to them. Eventually we will record the
2315 // context node assigned to this group of calls.
2316 struct CallsWithSameCallee {
2317 std::vector<CallInfo> Calls;
2318 ContextNode *Node = nullptr;
2319 };
2320
2321 // First partition calls by callee function. Build map from each function
2322 // to the list of matching calls.
2324 for (auto ThisCall : AllCalls) {
2325 auto *F = getCalleeFunc(ThisCall.call());
2326 if (F)
2327 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2328 }
2329
2330 // Next, walk through all callee edges. For each callee node, get its
2331 // containing function and see if it was recorded in the above map (meaning we
2332 // have at least one matching call). Build another map from each callee node
2333 // with a matching call to the structure instance created above containing all
2334 // the calls.
2336 for (const auto &Edge : Node->CalleeEdges) {
2337 if (!Edge->Callee->hasCall())
2338 continue;
2339 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2340 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2341 CalleeNodeToCallInfo[Edge->Callee] =
2342 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2343 }
2344
2345 // If there are entries in the second map, then there were no matching
2346 // calls/callees, nothing to do here. Return so we can go to the handling that
2347 // looks through tail calls.
2348 if (CalleeNodeToCallInfo.empty())
2349 return false;
2350
2351 // Walk through all callee edges again. Any and all callee edges that didn't
2352 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2353 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2354 // ignored during cloning. If it is in the map, then we use the node recorded
2355 // in that entry (creating it if needed), and move the callee edge to it.
2356 // The first callee will use the original node instead of creating a new one.
2357 // Note that any of the original calls on this node (in AllCalls) that didn't
2358 // have a callee function automatically get dropped from the node as part of
2359 // this process.
2360 ContextNode *UnmatchedCalleesNode = nullptr;
2361 // Track whether we already assigned original node to a callee.
2362 bool UsedOrigNode = false;
2363 assert(NodeToCallingFunc[Node]);
2364 // Iterate over a copy of Node's callee edges, since we may need to remove
2365 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2366 // makes it less error-prone.
2367 auto CalleeEdges = Node->CalleeEdges;
2368 for (auto &Edge : CalleeEdges) {
2369 if (!Edge->Callee->hasCall())
2370 continue;
2371
2372 // Will be updated below to point to whatever (caller) node this callee edge
2373 // should be moved to.
2374 ContextNode *CallerNodeToUse = nullptr;
2375
2376 // Handle the case where there were no matching calls first. Move this
2377 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2378 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2379 if (!UnmatchedCalleesNode)
2380 UnmatchedCalleesNode =
2381 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2382 CallerNodeToUse = UnmatchedCalleesNode;
2383 } else {
2384 // Look up the information recorded for this callee node, and use the
2385 // recorded caller node (creating it if needed).
2386 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2387 if (!Info->Node) {
2388 // If we haven't assigned any callees to the original node use it.
2389 if (!UsedOrigNode) {
2390 Info->Node = Node;
2391 // Clear the set of matching calls which will be updated below.
2392 Node->MatchingCalls.clear();
2393 UsedOrigNode = true;
2394 } else
2395 Info->Node =
2396 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2397 assert(!Info->Calls.empty());
2398 // The first call becomes the primary call for this caller node, and the
2399 // rest go in the matching calls list.
2400 Info->Node->setCall(Info->Calls.front());
2401 Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
2402 Info->Calls.begin() + 1,
2403 Info->Calls.end());
2404 // Save the primary call to node correspondence so that we can update
2405 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2406 // caller of this function.
2407 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2408 }
2409 CallerNodeToUse = Info->Node;
2410 }
2411
2412 // Don't need to move edge if we are using the original node;
2413 if (CallerNodeToUse == Node)
2414 continue;
2415
2416 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2417 }
2418 // Now that we are done moving edges, clean up any caller edges that ended
2419 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2420 // caller edges from Node are replicated onto the new callers, and it
2421 // simplifies the handling to leave them until we have moved all
2422 // edges/context ids.
2423 for (auto &I : CalleeNodeToCallInfo)
2424 removeNoneTypeCallerEdges(I.second->Node);
2425 if (UnmatchedCalleesNode)
2426 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2427 removeNoneTypeCallerEdges(Node);
2428
2429 return true;
2430}
2431
// Translate the given id-or-index into a stack id. For the regular LTO (IR)
// graph the value passed around is already the stack id itself.
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Module (IR) case this is already the Id.
  return IdOrIndex;
}
2436
// Translate the given id-or-index into a stack id. For the ThinLTO (index)
// graph the value is an index into the summary index's stack id list.
uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Index case this is an index into the stack id list in the summary
  // index, convert it to an Id.
  return Index.getStackIdAtIndex(IdOrIndex);
}
2442
// Returns true if the profiled callee of the edge at EI matches the callee of
// Call in the IR/summary, either directly or through a unique chain of tail
// calls. In the tail call case, synthesizes context nodes for the intermediate
// tail calls (memoized in TailCallToContextNodeMap), splices them into the
// graph between the edge's caller and callee, and removes the original edge.
// EI is adjusted so that the caller's loop increment remains valid.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain))
    return false;

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty())
    return true;

  // Helper to connect Caller -> Callee with an edge carrying the same context
  // ids and alloc types as the original Edge, merging into an existing edge
  // between the two nodes when one is already present. Takes care to keep EI
  // valid when inserting into the current caller's CalleeEdges list.
  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
                                 Edge->ContextIds.end());
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller, insert
      // the new edge before the current iterator position, and then increment
      // back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      // Reuse the node, but widen its alloc types with this edge's.
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

#ifndef NDEBUG
  // Save this because Edge's fields get cleared below when removed.
  auto *Caller = Edge->Caller;
#endif

  // Remove old edge
  removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);

  // To simplify the increment of EI in the caller, subtract one from EI.
  // In the final AddEdge call we would have either added a new callee edge,
  // to Edge->Caller, or found an existing one. Either way we are guaranteed
  // that there is at least one callee edge.
  assert(!Caller->CalleeEdges.empty());
  --EI;

  return true;
}
2531
2532bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2533 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2534 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2535 bool &FoundMultipleCalleeChains) {
2536 // Stop recursive search if we have already explored the maximum specified
2537 // depth.
2539 return false;
2540
2541 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2542 FoundCalleeChain.push_back({Callsite, F});
2543 };
2544
2545 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2546 if (!CalleeFunc) {
2547 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2548 assert(Alias);
2549 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2550 assert(CalleeFunc);
2551 }
2552
2553 // Look for tail calls in this function, and check if they either call the
2554 // profiled callee directly, or indirectly (via a recursive search).
2555 // Only succeed if there is a single unique tail call chain found between the
2556 // profiled caller and callee, otherwise we could perform incorrect cloning.
2557 bool FoundSingleCalleeChain = false;
2558 for (auto &BB : *CalleeFunc) {
2559 for (auto &I : BB) {
2560 auto *CB = dyn_cast<CallBase>(&I);
2561 if (!CB || !CB->isTailCall())
2562 continue;
2563 auto *CalledValue = CB->getCalledOperand();
2564 auto *CalledFunction = CB->getCalledFunction();
2565 if (CalledValue && !CalledFunction) {
2566 CalledValue = CalledValue->stripPointerCasts();
2567 // Stripping pointer casts can reveal a called function.
2568 CalledFunction = dyn_cast<Function>(CalledValue);
2569 }
2570 // Check if this is an alias to a function. If so, get the
2571 // called aliasee for the checks below.
2572 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2573 assert(!CalledFunction &&
2574 "Expected null called function in callsite for alias");
2575 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2576 }
2577 if (!CalledFunction)
2578 continue;
2579 if (CalledFunction == ProfiledCallee) {
2580 if (FoundSingleCalleeChain) {
2581 FoundMultipleCalleeChains = true;
2582 return false;
2583 }
2584 FoundSingleCalleeChain = true;
2585 FoundProfiledCalleeCount++;
2586 FoundProfiledCalleeDepth += Depth;
2587 if (Depth > FoundProfiledCalleeMaxDepth)
2588 FoundProfiledCalleeMaxDepth = Depth;
2589 SaveCallsiteInfo(&I, CalleeFunc);
2590 } else if (findProfiledCalleeThroughTailCalls(
2591 ProfiledCallee, CalledFunction, Depth + 1,
2592 FoundCalleeChain, FoundMultipleCalleeChains)) {
2593 // findProfiledCalleeThroughTailCalls should not have returned
2594 // true if FoundMultipleCalleeChains.
2595 assert(!FoundMultipleCalleeChains);
2596 if (FoundSingleCalleeChain) {
2597 FoundMultipleCalleeChains = true;
2598 return false;
2599 }
2600 FoundSingleCalleeChain = true;
2601 SaveCallsiteInfo(&I, CalleeFunc);
2602 } else if (FoundMultipleCalleeChains)
2603 return false;
2604 }
2605 }
2606
2607 return FoundSingleCalleeChain;
2608}
2609
2610const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2611 auto *CB = dyn_cast<CallBase>(Call);
2612 if (!CB->getCalledOperand() || CB->isIndirectCall())
2613 return nullptr;
2614 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2615 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2616 if (Alias)
2617 return dyn_cast<Function>(Alias->getAliasee());
2618 return dyn_cast<Function>(CalleeVal);
2619}
2620
2621bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2622 Instruction *Call, const Function *Func, const Function *CallerFunc,
2623 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2624 auto *CB = dyn_cast<CallBase>(Call);
2625 if (!CB->getCalledOperand() || CB->isIndirectCall())
2626 return false;
2627 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2628 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2629 if (CalleeFunc == Func)
2630 return true;
2631 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2632 if (Alias && Alias->getAliasee() == Func)
2633 return true;
2634
2635 // Recursively search for the profiled callee through tail calls starting with
2636 // the actual Callee. The discovered tail call chain is saved in
2637 // FoundCalleeChain, and we will fixup the graph to include these callsites
2638 // after returning.
2639 // FIXME: We will currently redo the same recursive walk if we find the same
2640 // mismatched callee from another callsite. We can improve this with more
2641 // bookkeeping of the created chain of new nodes for each mismatch.
2642 unsigned Depth = 1;
2643 bool FoundMultipleCalleeChains = false;
2644 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2645 FoundCalleeChain,
2646 FoundMultipleCalleeChains)) {
2647 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2648 << Func->getName() << " from " << CallerFunc->getName()
2649 << " that actually called " << CalleeVal->getName()
2650 << (FoundMultipleCalleeChains
2651 ? " (found multiple possible chains)"
2652 : "")
2653 << "\n");
2654 if (FoundMultipleCalleeChains)
2655 FoundProfiledCalleeNonUniquelyCount++;
2656 return false;
2657 }
2658
2659 return true;
2660}
2661
2662bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2663 Instruction *Call2) {
2664 auto *CB1 = cast<CallBase>(Call1);
2665 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2666 return false;
2667 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2668 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2669 auto *CB2 = cast<CallBase>(Call2);
2670 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2671 return false;
2672 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2673 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2674 return CalleeFunc1 == CalleeFunc2;
2675}
2676
2677bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2678 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2679 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2680 bool &FoundMultipleCalleeChains) {
2681 // Stop recursive search if we have already explored the maximum specified
2682 // depth.
2684 return false;
2685
2686 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2687 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2688 // been synthesized.
2689 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2690 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2691 // StackIds is empty (we don't have debug info available in the index for
2692 // these callsites)
2693 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2694 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2695 CallsiteInfo *NewCallsiteInfo =
2696 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2697 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2698 };
2699
2700 // Look for tail calls in this function, and check if they either call the
2701 // profiled callee directly, or indirectly (via a recursive search).
2702 // Only succeed if there is a single unique tail call chain found between the
2703 // profiled caller and callee, otherwise we could perform incorrect cloning.
2704 bool FoundSingleCalleeChain = false;
2705 for (auto &S : CurCallee.getSummaryList()) {
2706 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2707 !isPrevailing(CurCallee.getGUID(), S.get()))
2708 continue;
2709 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2710 if (!FS)
2711 continue;
2712 auto FSVI = CurCallee;
2713 auto *AS = dyn_cast<AliasSummary>(S.get());
2714 if (AS)
2715 FSVI = AS->getAliaseeVI();
2716 for (auto &CallEdge : FS->calls()) {
2717 if (!CallEdge.second.hasTailCall())
2718 continue;
2719 if (CallEdge.first == ProfiledCallee) {
2720 if (FoundSingleCalleeChain) {
2721 FoundMultipleCalleeChains = true;
2722 return false;
2723 }
2724 FoundSingleCalleeChain = true;
2725 FoundProfiledCalleeCount++;
2726 FoundProfiledCalleeDepth += Depth;
2727 if (Depth > FoundProfiledCalleeMaxDepth)
2728 FoundProfiledCalleeMaxDepth = Depth;
2729 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2730 // Add FS to FSToVIMap in case it isn't already there.
2731 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2732 FSToVIMap[FS] = FSVI;
2733 } else if (findProfiledCalleeThroughTailCalls(
2734 ProfiledCallee, CallEdge.first, Depth + 1,
2735 FoundCalleeChain, FoundMultipleCalleeChains)) {
2736 // findProfiledCalleeThroughTailCalls should not have returned
2737 // true if FoundMultipleCalleeChains.
2738 assert(!FoundMultipleCalleeChains);
2739 if (FoundSingleCalleeChain) {
2740 FoundMultipleCalleeChains = true;
2741 return false;
2742 }
2743 FoundSingleCalleeChain = true;
2744 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2745 // Add FS to FSToVIMap in case it isn't already there.
2746 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2747 FSToVIMap[FS] = FSVI;
2748 } else if (FoundMultipleCalleeChains)
2749 return false;
2750 }
2751 }
2752
2753 return FoundSingleCalleeChain;
2754}
2755
2756const FunctionSummary *
2757IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2758 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2759 if (Callee.getSummaryList().empty())
2760 return nullptr;
2761 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2762}
2763
// Returns true if the summary callee of Call matches the profiled callee
// Func, either directly, via an alias, or through a unique chain of tail
// calls (recorded in FoundCalleeChain).
bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
  // If there is no summary list then this is a call to an externally defined
  // symbol.
  AliasSummary *Alias =
      Callee.getSummaryList().empty()
          ? nullptr
          : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
  // Func must already have a recorded ValueInfo (FSToVIMap maps summaries
  // back to their ValueInfo).
  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];
  if (Callee == FuncVI ||
      // If callee is an alias, check the aliasee, since only function
      // summary base objects will contain the stack node summaries and thus
      // get a context node.
      (Alias && Alias->getAliaseeVI() == FuncVI))
    return true;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(
          FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                      << " from " << FSToVIMap[CallerFunc]
                      << " that actually called " << Callee
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}
2809
2810bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2811 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2812 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2813 return Callee1 == Callee2;
2814}
2815
2816template <typename DerivedCCG, typename FuncTy, typename CallTy>
2817void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2818 const {
2819 print(dbgs());
2820 dbgs() << "\n";
2821}
2822
2823template <typename DerivedCCG, typename FuncTy, typename CallTy>
2824void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2825 raw_ostream &OS) const {
2826 OS << "Node " << this << "\n";
2827 OS << "\t";
2828 printCall(OS);
2829 if (Recursive)
2830 OS << " (recursive)";
2831 OS << "\n";
2832 if (!MatchingCalls.empty()) {
2833 OS << "\tMatchingCalls:\n";
2834 for (auto &MatchingCall : MatchingCalls) {
2835 OS << "\t";
2836 MatchingCall.print(OS);
2837 OS << "\n";
2838 }
2839 }
2840 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2841 OS << "\tContextIds:";
2842 // Make a copy of the computed context ids that we can sort for stability.
2843 auto ContextIds = getContextIds();
2844 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2845 std::sort(SortedIds.begin(), SortedIds.end());
2846 for (auto Id : SortedIds)
2847 OS << " " << Id;
2848 OS << "\n";
2849 OS << "\tCalleeEdges:\n";
2850 for (auto &Edge : CalleeEdges)
2851 OS << "\t\t" << *Edge << "\n";
2852 OS << "\tCallerEdges:\n";
2853 for (auto &Edge : CallerEdges)
2854 OS << "\t\t" << *Edge << "\n";
2855 if (!Clones.empty()) {
2856 OS << "\tClones: ";
2857 ListSeparator LS;
2858 for (auto *Clone : Clones)
2859 OS << LS << Clone;
2860 OS << "\n";
2861 } else if (CloneOf) {
2862 OS << "\tClone of " << CloneOf << "\n";
2863 }
2864}
2865
2866template <typename DerivedCCG, typename FuncTy, typename CallTy>
2867void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2868 const {
2869 print(dbgs());
2870 dbgs() << "\n";
2871}
2872
2873template <typename DerivedCCG, typename FuncTy, typename CallTy>
2874void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2875 raw_ostream &OS) const {
2876 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2877 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2878 OS << " ContextIds:";
2879 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2880 std::sort(SortedIds.begin(), SortedIds.end());
2881 for (auto Id : SortedIds)
2882 OS << " " << Id;
2883}
2884
// Debug helper: print the whole graph to the debug output stream.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
  print(dbgs());
}
2889
2890template <typename DerivedCCG, typename FuncTy, typename CallTy>
2891void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2892 raw_ostream &OS) const {
2893 OS << "Callsite Context Graph:\n";
2894 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2895 for (const auto Node : nodes<GraphType>(this)) {
2896 if (Node->isRemoved())
2897 continue;
2898 Node->print(OS);
2899 OS << "\n";
2900 }
2901}
2902
// For each allocation node, emit one "MemProf hinting" line per recorded
// context size record: the full allocation context hash, its total size, the
// node's alloc types after cloning, and (when different) the allocation type
// actually applied to the call.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
    raw_ostream &OS) const {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  for (const auto Node : nodes<GraphType>(this)) {
    if (Node->isRemoved())
      continue;
    // Only allocation nodes carry context size information.
    if (!Node->IsAllocation)
      continue;
    DenseSet<uint32_t> ContextIds = Node->getContextIds();
    auto AllocTypeFromCall = getAllocationCallType(Node->Call);
    // Sort the ids so output is deterministic.
    std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
    std::sort(SortedIds.begin(), SortedIds.end());
    for (auto Id : SortedIds) {
      auto TypeI = ContextIdToAllocationType.find(Id);
      assert(TypeI != ContextIdToAllocationType.end());
      auto CSI = ContextIdToContextSizeInfos.find(Id);
      if (CSI != ContextIdToContextSizeInfos.end()) {
        for (auto &Info : CSI->second) {
          OS << "MemProf hinting: "
             << getAllocTypeString((uint8_t)TypeI->second)
             << " full allocation context " << Info.FullStackId
             << " with total size " << Info.TotalSize << " is "
             << getAllocTypeString(Node->AllocTypes) << " after cloning";
          // Note when the type applied to the call differs from the node's
          // computed alloc types.
          if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
            OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
               << " due to cold byte percent";
          OS << "\n";
        }
      }
    }
  }
}
2936
2937template <typename DerivedCCG, typename FuncTy, typename CallTy>
2938void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2939 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2940 for (const auto Node : nodes<GraphType>(this)) {
2941 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2942 for (auto &Edge : Node->CallerEdges)
2943 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2944 }
2945}
2946
2947template <typename DerivedCCG, typename FuncTy, typename CallTy>
2948struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2949 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2950 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2951
2952 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2953 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2954
2957 decltype(&getNode)>;
2958
2960 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2961 }
2962
2964 return nodes_iterator(G->NodeOwner.end(), &getNode);
2965 }
2966
2968 return G->NodeOwner.begin()->get();
2969 }
2970
2971 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2972 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2974 return P->Callee;
2975 }
2976
2978 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2979 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2980 decltype(&GetCallee)>;
2981
2983 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2984 }
2985
2987 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2988 }
2989};
2990
2991template <typename DerivedCCG, typename FuncTy, typename CallTy>
2992struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2993 : public DefaultDOTGraphTraits {
2994 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2995
2996 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2998 using NodeRef = typename GTraits::NodeRef;
2999 using ChildIteratorType = typename GTraits::ChildIteratorType;
3000
3001 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3002 std::string LabelString =
3003 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3004 Twine(Node->OrigStackOrAllocId))
3005 .str();
3006 LabelString += "\n";
3007 if (Node->hasCall()) {
3008 auto Func = G->NodeToCallingFunc.find(Node);
3009 assert(Func != G->NodeToCallingFunc.end());
3010 LabelString +=
3011 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3012 } else {
3013 LabelString += "null call";
3014 if (Node->Recursive)
3015 LabelString += " (recursive)";
3016 else
3017 LabelString += " (external)";
3018 }
3019 return LabelString;
3020 }
3021
3022 static std::string getNodeAttributes(NodeRef Node, GraphType) {
3023 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3024 getContextIds(Node->getContextIds()) + "\"")
3025 .str();
3026 AttributeString +=
3027 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
3028 AttributeString += ",style=\"filled\"";
3029 if (Node->CloneOf) {
3030 AttributeString += ",color=\"blue\"";
3031 AttributeString += ",style=\"filled,bold,dashed\"";
3032 } else
3033 AttributeString += ",style=\"filled\"";
3034 return AttributeString;
3035 }
3036
3037 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3038 GraphType) {
3039 auto &Edge = *(ChildIter.getCurrent());
3040 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3041 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
3042 .str();
3043 }
3044
3045 // Since the NodeOwners list includes nodes that are no longer connected to
3046 // the graph, skip them here.
3047 static bool isNodeHidden(NodeRef Node, GraphType) {
3048 return Node->isRemoved();
3049 }
3050
3051private:
3052 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3053 std::string IdString = "ContextIds:";
3054 if (ContextIds.size() < 100) {
3055 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3056 std::sort(SortedIds.begin(), SortedIds.end());
3057 for (auto Id : SortedIds)
3058 IdString += (" " + Twine(Id)).str();
3059 } else {
3060 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3061 }
3062 return IdString;
3063 }
3064
3065 static std::string getColor(uint8_t AllocTypes) {
3066 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3067 // Color "brown1" actually looks like a lighter red.
3068 return "brown1";
3069 if (AllocTypes == (uint8_t)AllocationType::Cold)
3070 return "cyan";
3071 if (AllocTypes ==
3073 // Lighter purple.
3074 return "mediumorchid1";
3075 return "gray";
3076 }
3077
3078 static std::string getNodeId(NodeRef Node) {
3079 std::stringstream SStream;
3080 SStream << std::hex << "N0x" << (unsigned long long)Node;
3081 std::string Result = SStream.str();
3082 return Result;
3083 }
3084};
3085
3086template <typename DerivedCCG, typename FuncTy, typename CallTy>
3087void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3088 std::string Label) const {
3089 WriteGraph(this, "", false, Label,
3090 DotFilePathPrefix + "ccg." + Label + ".dot");
3091}
3092
3093template <typename DerivedCCG, typename FuncTy, typename CallTy>
3094typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3095CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3096 const std::shared_ptr<ContextEdge> &Edge,
3097 DenseSet<uint32_t> ContextIdsToMove) {
3098 ContextNode *Node = Edge->Callee;
3099 assert(NodeToCallingFunc.count(Node));
3100 ContextNode *Clone =
3101 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3102 Node->addClone(Clone);
3103 Clone->MatchingCalls = Node->MatchingCalls;
3104 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3105 ContextIdsToMove);
3106 return Clone;
3107}
3108
// Moves the context ids in ContextIdsToMove from Edge (whose callee is a
// clone of NewCallee's original node) over to NewCallee. If ContextIdsToMove
// is empty, all of Edge's ids are moved (i.e. the whole edge moves). The
// corresponding subset of ids is also propagated from OldCallee's callee
// edges onto NewCallee's callee edges, creating those edges if needed.
// Alloc types on all affected edges and nodes are recomputed; OldCallee may
// be left with a None alloc type iff its context id set becomes empty.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCallee, bool NewClone,
                                  DenseSet<uint32_t> ContextIdsToMove) {
  // NewCallee and Edge's current callee must be clones of the same original
  // node (Edge's current callee may be the original node too).
  assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());

  ContextNode *OldCallee = Edge->Callee;

  // We might already have an edge to the new callee from earlier cloning for a
  // different allocation. If one exists we will reuse it.
  auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);

  // Callers will pass an empty ContextIdsToMove set when they want to move the
  // edge. Copy in Edge's ids for simplicity.
  if (ContextIdsToMove.empty())
    ContextIdsToMove = Edge->getContextIds();

  // If we are moving all of Edge's ids, then just move the whole Edge.
  // Otherwise only move the specified subset, to a new edge if needed.
  if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
    // First, update the alloc types on New Callee from Edge.
    // Do this before we potentially clear Edge's fields below!
    NewCallee->AllocTypes |= Edge->AllocTypes;
    // Moving the whole Edge.
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it, and remove the existing Edge.
      ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
                                                      ContextIdsToMove.end());
      ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
      assert(Edge->ContextIds == ContextIdsToMove);
      removeEdgeFromGraph(Edge.get());
    } else {
      // Otherwise just reconnect Edge to NewCallee.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      // Remove it from callee where it was previously connected.
      OldCallee->eraseCallerEdge(Edge.get());
      // Don't need to update Edge's context ids since we are simply
      // reconnecting it.
    }
  } else {
    // Only moving a subset of Edge's ids.
    // Compute the alloc type of the subset of ids being moved.
    auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it.
      ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
                                                      ContextIdsToMove.end());
      ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
    } else {
      // Otherwise, create a new edge to NewCallee for the ids being moved.
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
      Edge->Caller->CalleeEdges.push_back(NewEdge);
      NewCallee->CallerEdges.push_back(NewEdge);
    }
    // In either case, need to update the alloc types on NewCallee, and remove
    // those ids and update the alloc type on the original Edge.
    NewCallee->AllocTypes |= CallerEdgeAllocType;
    set_subtract(Edge->ContextIds, ContextIdsToMove);
    Edge->AllocTypes = computeAllocType(Edge->ContextIds);
  }
  // Now walk the old callee node's callee edges and move Edge's context ids
  // over to the corresponding edge into the clone (which is created here if
  // this is a newly created clone).
  for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
    ContextNode *CalleeToUse = OldCalleeEdge->Callee;
    // If this is a direct recursion edge, use NewCallee (the clone) as the
    // callee as well, so that any edge updated/created here is also direct
    // recursive.
    if (CalleeToUse == OldCallee)
      CalleeToUse = NewCallee;
    // The context ids moving to the new callee are the subset of this edge's
    // context ids and the context ids on the caller edge being moved.
    DenseSet<uint32_t> EdgeContextIdsToMove =
        set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
    set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
    OldCalleeEdge->AllocTypes =
        computeAllocType(OldCalleeEdge->getContextIds());
    if (!NewClone) {
      // Update context ids / alloc type on corresponding edge to NewCallee.
      // There is a chance this may not exist if we are reusing an existing
      // clone, specifically during function assignment, where we would have
      // removed none type edges after creating the clone. If we can't find
      // a corresponding edge there, fall through to the cloning below.
      if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
        NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
                                              EdgeContextIdsToMove.end());
        NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
        continue;
      }
    }
    // No reusable edge on NewCallee: create one carrying the moved subset.
    auto NewEdge = std::make_shared<ContextEdge>(
        CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
        EdgeContextIdsToMove);
    NewCallee->CalleeEdges.push_back(NewEdge);
    NewEdge->Callee->CallerEdges.push_back(NewEdge);
  }
  // Recompute the node alloc type now that its callee edges have been
  // updated (since we will compute from those edges).
  OldCallee->AllocTypes = OldCallee->computeAllocType();
  // OldCallee alloc type should be None iff its context id set is now empty.
  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCallee->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
    for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
    for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
  }
}
3229
// Moves Edge (a callee edge of its current caller) so that it originates from
// NewCaller instead, reusing an existing NewCaller->callee edge if one is
// present. Direct recursive edges (Caller == Callee) are kept recursive on
// NewCaller. Unless the edge was recursive, the moved context ids are also
// migrated from OldCaller's caller edges onto NewCaller's caller edges
// (creating them if needed), and alloc types are recomputed throughout.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                              ContextNode *NewCaller) {
  auto *OldCallee = Edge->Callee;
  auto *NewCallee = OldCallee;
  // If this edge was direct recursive, make any new/updated edge also direct
  // recursive to NewCaller.
  bool Recursive = Edge->Caller == Edge->Callee;
  if (Recursive)
    NewCallee = NewCaller;

  ContextNode *OldCaller = Edge->Caller;
  OldCaller->eraseCalleeEdge(Edge.get());

  // We might already have an edge to the new caller. If one exists we will
  // reuse it.
  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);

  if (ExistingEdgeToNewCaller) {
    // Since we already have an edge to NewCaller, simply move the ids
    // onto it, and remove the existing Edge.
    ExistingEdgeToNewCaller->getContextIds().insert(
        Edge->getContextIds().begin(), Edge->getContextIds().end());
    ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
    Edge->ContextIds.clear();
    Edge->AllocTypes = (uint8_t)AllocationType::None;
    OldCallee->eraseCallerEdge(Edge.get());
  } else {
    // Otherwise just reconnect Edge to NewCaller.
    Edge->Caller = NewCaller;
    NewCaller->CalleeEdges.push_back(Edge);
    if (Recursive) {
      assert(NewCallee == NewCaller);
      // In the case of (direct) recursive edges, we update the callee as well
      // so that it becomes recursive on the new caller.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      OldCallee->eraseCallerEdge(Edge.get());
    }
    // Don't need to update Edge's context ids since we are simply
    // reconnecting it.
  }
  // In either case, need to update the alloc types on New Caller.
  NewCaller->AllocTypes |= Edge->AllocTypes;

  // Now walk the old caller node's caller edges and move Edge's context ids
  // over to the corresponding edge into the node (which is created here if
  // this is a newly created node). We can tell whether this is a newly created
  // node by seeing if it has any caller edges yet.
#ifndef NDEBUG
  bool IsNewNode = NewCaller->CallerEdges.empty();
#endif
  // If we just moved a direct recursive edge, presumably its context ids should
  // also flow out of OldCaller via some other non-recursive callee edge. We
  // don't want to remove the recursive context ids from other caller edges yet,
  // otherwise the context ids get into an inconsistent state on OldCaller.
  // We will update these context ids on the non-recursive caller edge when and
  // if they are updated on the non-recursive callee.
  if (!Recursive) {
    for (auto &OldCallerEdge : OldCaller->CallerEdges) {
      auto OldCallerCaller = OldCallerEdge->Caller;
      // The context ids moving to the new caller are the subset of this edge's
      // context ids and the context ids on the callee edge being moved.
      DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
          OldCallerEdge->getContextIds(), Edge->getContextIds());
      if (OldCaller == OldCallerCaller) {
        OldCallerCaller = NewCaller;
        // Don't actually move this one. The caller will move it directly via a
        // call to this function with this as the Edge if it is appropriate to
        // move to a diff node that has a matching callee (itself).
        continue;
      }
      set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
      OldCallerEdge->AllocTypes =
          computeAllocType(OldCallerEdge->getContextIds());
      // In this function we expect that any pre-existing node already has edges
      // from the same callers as the old node. That should be true in the
      // current use case, where we will remove None-type edges after copying
      // over all caller edges from the callee.
      auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
      // Since we would have skipped caller edges when moving a direct recursive
      // edge, this may not hold true when recursive handling enabled.
      assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
      if (ExistingCallerEdge) {
        ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
                                                   EdgeContextIdsToMove.end());
        ExistingCallerEdge->AllocTypes |=
            computeAllocType(EdgeContextIdsToMove);
        continue;
      }
      // Otherwise create a fresh caller edge carrying the moved id subset.
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
          EdgeContextIdsToMove);
      NewCaller->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
  }
  // Recompute the node alloc type now that its caller edges have been
  // updated (since we will compute from those edges).
  OldCaller->AllocTypes = OldCaller->computeAllocType();
  // OldCaller alloc type should be None iff its context id set is now empty.
  assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCaller->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
    for (const auto &OldCallerEdge : OldCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
                                            /*CheckEdges=*/false);
    for (const auto &NewCallerEdge : NewCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
                                            /*CheckEdges=*/false);
  }
}
3345
3346template <typename DerivedCCG, typename FuncTy, typename CallTy>
3347void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3348 recursivelyRemoveNoneTypeCalleeEdges(
3349 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3350 auto Inserted = Visited.insert(Node);
3351 if (!Inserted.second)
3352 return;
3353
3354 removeNoneTypeCalleeEdges(Node);
3355
3356 for (auto *Clone : Node->Clones)
3357 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3358
3359 // The recursive call may remove some of this Node's caller edges.
3360 // Iterate over a copy and skip any that were removed.
3361 auto CallerEdges = Node->CallerEdges;
3362 for (auto &Edge : CallerEdges) {
3363 // Skip any that have been removed by an earlier recursive call.
3364 if (Edge->isRemoved()) {
3365 assert(!is_contained(Node->CallerEdges, Edge));
3366 continue;
3367 }
3368 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3369 }
3370}
3371
3372template <typename DerivedCCG, typename FuncTy, typename CallTy>
3373void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3375 for (auto &Entry : AllocationCallToContextNodeMap) {
3376 Visited.clear();
3377 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3378 }
3379 Visited.clear();
3380 for (auto &Entry : AllocationCallToContextNodeMap)
3381 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3382 if (VerifyCCG)
3383 check();
3384}
3385
3386// helper function to check an AllocType is cold or notcold or both.
3388 return (AllocType == (uint8_t)AllocationType::Cold) ||
3390 (AllocType ==
3392}
3393
3394template <typename DerivedCCG, typename FuncTy, typename CallTy>
3395void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3396 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3397 const DenseSet<uint32_t> &AllocContextIds) {
3398 if (VerifyNodes)
3399 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3400 assert(!Node->CloneOf);
3401
3402 // If Node as a null call, then either it wasn't found in the module (regular
3403 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3404 // cloning (e.g. recursion, calls multiple targets, etc).
3405 // Do this here so that we don't try to recursively clone callers below, which
3406 // isn't useful at least for this node.
3407 if (!Node->hasCall())
3408 return;
3409
3410 // No need to look at any callers if allocation type already unambiguous.
3411 if (hasSingleAllocType(Node->AllocTypes))
3412 return;
3413
3414#ifndef NDEBUG
3415 auto Insert =
3416#endif
3417 Visited.insert(Node);
3418 // We should not have visited this node yet.
3419 assert(Insert.second);
3420 // The recursive call to identifyClones may delete the current edge from the
3421 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3422 // in an iterator and having recursive call erase from it. Other edges may
3423 // also get removed during the recursion, which will have null Callee and
3424 // Caller pointers (and are deleted later), so we skip those below.
3425 {
3426 auto CallerEdges = Node->CallerEdges;
3427 for (auto &Edge : CallerEdges) {
3428 // Skip any that have been removed by an earlier recursive call.
3429 if (Edge->isRemoved()) {
3430 assert(!is_contained(Node->CallerEdges, Edge));
3431 continue;
3432 }
3433 // Ignore any caller we previously visited via another edge.
3434 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3435 identifyClones(Edge->Caller, Visited, AllocContextIds);
3436 }
3437 }
3438 }
3439
3440 // Check if we reached an unambiguous call or have have only a single caller.
3441 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3442 return;
3443
3444 // We need to clone.
3445
3446 // Try to keep the original version as alloc type NotCold. This will make
3447 // cases with indirect calls or any other situation with an unknown call to
3448 // the original function get the default behavior. We do this by sorting the
3449 // CallerEdges of the Node we will clone by alloc type.
3450 //
3451 // Give NotCold edge the lowest sort priority so those edges are at the end of
3452 // the caller edges vector, and stay on the original version (since the below
3453 // code clones greedily until it finds all remaining edges have the same type
3454 // and leaves the remaining ones on the original Node).
3455 //
3456 // We shouldn't actually have any None type edges, so the sorting priority for
3457 // that is arbitrary, and we assert in that case below.
3458 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3459 /*Cold*/ 1,
3460 /*NotColdCold*/ 2};
3461 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
3462 [&](const std::shared_ptr<ContextEdge> &A,
3463 const std::shared_ptr<ContextEdge> &B) {
3464 // Nodes with non-empty context ids should be sorted before
3465 // those with empty context ids.
3466 if (A->ContextIds.empty())
3467 // Either B ContextIds are non-empty (in which case we
3468 // should return false because B < A), or B ContextIds
3469 // are empty, in which case they are equal, and we should
3470 // maintain the original relative ordering.
3471 return false;
3472 if (B->ContextIds.empty())
3473 return true;
3474
3475 if (A->AllocTypes == B->AllocTypes)
3476 // Use the first context id for each edge as a
3477 // tie-breaker.
3478 return *A->ContextIds.begin() < *B->ContextIds.begin();
3479 return AllocTypeCloningPriority[A->AllocTypes] <
3480 AllocTypeCloningPriority[B->AllocTypes];
3481 });
3482
3483 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3484
3485 DenseSet<uint32_t> RecursiveContextIds;
3486 // If we are allowing recursive callsites, but have also disabled recursive
3487 // contexts, look for context ids that show up in multiple caller edges.
3489 DenseSet<uint32_t> AllCallerContextIds;
3490 for (auto &CE : Node->CallerEdges) {
3491 // Resize to the largest set of caller context ids, since we know the
3492 // final set will be at least that large.
3493 AllCallerContextIds.reserve(CE->getContextIds().size());
3494 for (auto Id : CE->getContextIds())
3495 if (!AllCallerContextIds.insert(Id).second)
3496 RecursiveContextIds.insert(Id);
3497 }
3498 }
3499
3500 // Iterate until we find no more opportunities for disambiguating the alloc
3501 // types via cloning. In most cases this loop will terminate once the Node
3502 // has a single allocation type, in which case no more cloning is needed.
3503 // Iterate over a copy of Node's caller edges, since we may need to remove
3504 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3505 // makes it less error-prone.
3506 auto CallerEdges = Node->CallerEdges;
3507 for (auto &CallerEdge : CallerEdges) {
3508 // See if cloning the prior caller edge left this node with a single alloc
3509 // type or a single caller. In that case no more cloning of Node is needed.
3510 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3511 break;
3512
3513 // If the caller was not successfully matched to a call in the IR/summary,
3514 // there is no point in trying to clone for it as we can't update that call.
3515 if (!CallerEdge->Caller->hasCall())
3516 continue;
3517
3518 // Only need to process the ids along this edge pertaining to the given
3519 // allocation.
3520 auto CallerEdgeContextsForAlloc =
3521 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3522 if (!RecursiveContextIds.empty())
3523 CallerEdgeContextsForAlloc =
3524 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3525 if (CallerEdgeContextsForAlloc.empty())
3526 continue;
3527
3528 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3529
3530 // Compute the node callee edge alloc types corresponding to the context ids
3531 // for this caller edge.
3532 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3533 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3534 for (auto &CalleeEdge : Node->CalleeEdges)
3535 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3536 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3537
3538 // Don't clone if doing so will not disambiguate any alloc types amongst
3539 // caller edges (including the callee edges that would be cloned).
3540 // Otherwise we will simply move all edges to the clone.
3541 //
3542 // First check if by cloning we will disambiguate the caller allocation
3543 // type from node's allocation type. Query allocTypeToUse so that we don't
3544 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3545 // neither of these should be None type.
3546 //
3547 // Then check if by cloning node at least one of the callee edges will be
3548 // disambiguated by splitting out different context ids.
3549 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3550 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3551 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
3552 allocTypeToUse(Node->AllocTypes) &&
3553 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3554 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
3555 continue;
3556
3557 // First see if we can use an existing clone. Check each clone and its
3558 // callee edges for matching alloc types.
3559 ContextNode *Clone = nullptr;
3560 for (auto *CurClone : Node->Clones) {
3561 if (allocTypeToUse(CurClone->AllocTypes) !=
3562 allocTypeToUse(CallerAllocTypeForAlloc))
3563 continue;
3564
3565 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3566 hasSingleAllocType(CallerAllocTypeForAlloc);
3567 // The above check should mean that if both have single alloc types that
3568 // they should be equal.
3569 assert(!BothSingleAlloc ||
3570 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3571
3572 // If either both have a single alloc type (which are the same), or if the
3573 // clone's callee edges have the same alloc types as those for the current
3574 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3575 // then we can reuse this clone.
3576 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3577 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3578 Clone = CurClone;
3579 break;
3580 }
3581 }
3582
3583 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3584 if (Clone)
3585 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3586 CallerEdgeContextsForAlloc);
3587 else
3588 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3589
3590 // Sanity check that no alloc types on clone or its edges are None.
3591 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3592 }
3593
3594 // We should still have some context ids on the original Node.
3595 assert(!Node->emptyContextIds());
3596
3597 // Sanity check that no alloc types on node or edges are None.
3598 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3599
3600 if (VerifyNodes)
3601 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3602}
3603
3604void ModuleCallsiteContextGraph::updateAllocationCall(
3606 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3607 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3608 "memprof", AllocTypeString);
3609 cast<CallBase>(Call.call())->addFnAttr(A);
3610 OREGetter(Call.call()->getFunction())
3611 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3612 << ore::NV("AllocationCall", Call.call()) << " in clone "
3613 << ore::NV("Caller", Call.call()->getFunction())
3614 << " marked with memprof allocation attribute "
3615 << ore::NV("Attribute", AllocTypeString));
3616}
3617
3618void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3620 auto *AI = cast<AllocInfo *>(Call.call());
3621 assert(AI);
3622 assert(AI->Versions.size() > Call.cloneNo());
3623 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3624}
3625
3627ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3628 const auto *CB = cast<CallBase>(Call.call());
3629 if (!CB->getAttributes().hasFnAttr("memprof"))
3630 return AllocationType::None;
3631 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3634}
3635
3637IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3638 const auto *AI = cast<AllocInfo *>(Call.call());
3639 assert(AI->Versions.size() > Call.cloneNo());
3640 return (AllocationType)AI->Versions[Call.cloneNo()];
3641}
3642
3643void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3644 FuncInfo CalleeFunc) {
3645 if (CalleeFunc.cloneNo() > 0)
3646 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3647 OREGetter(CallerCall.call()->getFunction())
3648 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3649 << ore::NV("Call", CallerCall.call()) << " in clone "
3650 << ore::NV("Caller", CallerCall.call()->getFunction())
3651 << " assigned to call function clone "
3652 << ore::NV("Callee", CalleeFunc.func()));
3653}
3654
3655void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3656 FuncInfo CalleeFunc) {
3657 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
3658 assert(CI &&
3659 "Caller cannot be an allocation which should not have profiled calls");
3660 assert(CI->Clones.size() > CallerCall.cloneNo());
3661 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3662}
3663
3664CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3665 Instruction *>::FuncInfo
3666ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3667 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3668 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3669 // Use existing LLVM facilities for cloning and obtaining Call in clone
3670 ValueToValueMapTy VMap;
3671 auto *NewFunc = CloneFunction(Func.func(), VMap);
3672 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
3673 assert(!Func.func()->getParent()->getFunction(Name));
3674 NewFunc->setName(Name);
3675 for (auto &Inst : CallsWithMetadataInFunc) {
3676 // This map always has the initial version in it.
3677 assert(Inst.cloneNo() == 0);
3678 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
3679 }
3680 OREGetter(Func.func())
3681 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
3682 << "created clone " << ore::NV("NewFunction", NewFunc));
3683 return {NewFunc, CloneNo};
3684}
3685
3686CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
3687 IndexCall>::FuncInfo
3688IndexCallsiteContextGraph::cloneFunctionForCallsite(
3689 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3690 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3691 // Check how many clones we have of Call (and therefore function).
3692 // The next clone number is the current size of versions array.
3693 // Confirm this matches the CloneNo provided by the caller, which is based on
3694 // the number of function clones we have.
3695 assert(CloneNo == (isa<AllocInfo *>(Call.call())
3696 ? cast<AllocInfo *>(Call.call())->Versions.size()
3697 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
3698 // Walk all the instructions in this function. Create a new version for
3699 // each (by adding an entry to the Versions/Clones summary array), and copy
3700 // over the version being called for the function clone being cloned here.
3701 // Additionally, add an entry to the CallMap for the new function clone,
3702 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
3703 // to the new call clone.
3704 for (auto &Inst : CallsWithMetadataInFunc) {
3705 // This map always has the initial version in it.
3706 assert(Inst.cloneNo() == 0);
3707 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
3708 assert(AI->Versions.size() == CloneNo);
3709 // We assign the allocation type later (in updateAllocationCall), just add
3710 // an entry for it here.
3711 AI->Versions.push_back(0);
3712 } else {
3713 auto *CI = cast<CallsiteInfo *>(Inst.call());
3714 assert(CI && CI->Clones.size() == CloneNo);
3715 // We assign the clone number later (in updateCall), just add an entry for
3716 // it here.
3717 CI->Clones.push_back(0);
3718 }
3719 CallMap[Inst] = {Inst.call(), CloneNo};
3720 }
3721 return {Func.func(), CloneNo};
3722}
3723
3724// This method assigns cloned callsites to functions, cloning the functions as
3725// needed. The assignment is greedy and proceeds roughly as follows:
3726//
3727// For each function Func:
3728// For each call with graph Node having clones:
3729// Initialize ClonesWorklist to Node and its clones
3730// Initialize NodeCloneCount to 0
3731// While ClonesWorklist is not empty:
3732// Clone = pop front ClonesWorklist
3733// NodeCloneCount++
3734// If Func has been cloned less than NodeCloneCount times:
3735// If NodeCloneCount is 1:
3736// Assign Clone to original Func
3737// Continue
3738// Create a new function clone
3739// If other callers not assigned to call a function clone yet:
3740// Assign them to call new function clone
3741// Continue
3742// Assign any other caller calling the cloned version to new clone
3743//
3744// For each caller of Clone:
3745// If caller is assigned to call a specific function clone:
3746// If we cannot assign Clone to that function clone:
3747// Create new callsite Clone NewClone
3748// Add NewClone to ClonesWorklist
3749// Continue
3750// Assign Clone to existing caller's called function clone
3751// Else:
3752// If Clone not already assigned to a function clone:
3753// Assign to first function clone without assignment
3754// Assign caller to selected function clone
3755template <typename DerivedCCG, typename FuncTy, typename CallTy>
3756bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3757 bool Changed = false;
3758
3759 // Keep track of the assignment of nodes (callsites) to function clones they
3760 // call.
3761 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3762
3763 // Update caller node to call function version CalleeFunc, by recording the
3764 // assignment in CallsiteToCalleeFuncCloneMap.
3765 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3766 const FuncInfo &CalleeFunc) {
3767 assert(Caller->hasCall());
3768 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3769 };
3770
3771 // Walk all functions for which we saw calls with memprof metadata, and handle
3772 // cloning for each of its calls.
3773 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3774 FuncInfo OrigFunc(Func);
3775 // Map from each clone of OrigFunc to a map of remappings of each call of
3776 // interest (from original uncloned call to the corresponding cloned call in
3777 // that function clone).
3778 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3779 for (auto &Call : CallsWithMetadata) {
3780 ContextNode *Node = getNodeForInst(Call);
3781 // Skip call if we do not have a node for it (all uses of its stack ids
3782 // were either on inlined chains or pruned from the MIBs), or if we did
3783 // not create any clones for it.
3784 if (!Node || Node->Clones.empty())
3785 continue;
3786 assert(Node->hasCall() &&
3787 "Not having a call should have prevented cloning");
3788
3789 // Track the assignment of function clones to clones of the current
3790 // callsite Node being handled.
3791 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3792
3793 // Assign callsite version CallsiteClone to function version FuncClone,
3794 // and also assign (possibly cloned) Call to CallsiteClone.
3795 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3796 CallInfo &Call,
3797 ContextNode *CallsiteClone,
3798 bool IsAlloc) {
3799 // Record the clone of callsite node assigned to this function clone.
3800 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3801
3802 assert(FuncClonesToCallMap.count(FuncClone));
3803 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3804 CallInfo CallClone(Call);
3805 if (CallMap.count(Call))
3806 CallClone = CallMap[Call];
3807 CallsiteClone->setCall(CallClone);
3808 // Need to do the same for all matching calls.
3809 for (auto &MatchingCall : Node->MatchingCalls) {
3810 CallInfo CallClone(MatchingCall);
3811 if (CallMap.count(MatchingCall))
3812 CallClone = CallMap[MatchingCall];
3813 // Updates the call in the list.
3814 MatchingCall = CallClone;
3815 }
3816 };
3817
3818 // Keep track of the clones of callsite Node that need to be assigned to
3819 // function clones. This list may be expanded in the loop body below if we
3820 // find additional cloning is required.
3821 std::deque<ContextNode *> ClonesWorklist;
3822 // Ignore original Node if we moved all of its contexts to clones.
3823 if (!Node->emptyContextIds())
3824 ClonesWorklist.push_back(Node);
3825 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3826 Node->Clones.end());
3827
3828 // Now walk through all of the clones of this callsite Node that we need,
3829 // and determine the assignment to a corresponding clone of the current
3830 // function (creating new function clones as needed).
3831 unsigned NodeCloneCount = 0;
3832 while (!ClonesWorklist.empty()) {
3833 ContextNode *Clone = ClonesWorklist.front();
3834 ClonesWorklist.pop_front();
3835 NodeCloneCount++;
3836 if (VerifyNodes)
3837 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3838
3839 // Need to create a new function clone if we have more callsite clones
3840 // than existing function clones, which would have been assigned to an
3841 // earlier clone in the list (we assign callsite clones to function
3842 // clones greedily).
3843 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3844 // If this is the first callsite copy, assign to original function.
3845 if (NodeCloneCount == 1) {
3846 // Since FuncClonesToCallMap is empty in this case, no clones have
3847 // been created for this function yet, and no callers should have
3848 // been assigned a function clone for this callee node yet.
3850 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3851 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3852 }));
3853 // Initialize with empty call map, assign Clone to original function
3854 // and its callers, and skip to the next clone.
3855 FuncClonesToCallMap[OrigFunc] = {};
3856 AssignCallsiteCloneToFuncClone(
3857 OrigFunc, Call, Clone,
3858 AllocationCallToContextNodeMap.count(Call));
3859 for (auto &CE : Clone->CallerEdges) {
3860 // Ignore any caller that does not have a recorded callsite Call.
3861 if (!CE->Caller->hasCall())
3862 continue;
3863 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3864 }
3865 continue;
3866 }
3867
3868 // First locate which copy of OrigFunc to clone again. If a caller
3869 // of this callsite clone was already assigned to call a particular
3870 // function clone, we need to redirect all of those callers to the
3871 // new function clone, and update their other callees within this
3872 // function.
3873 FuncInfo PreviousAssignedFuncClone;
3874 auto EI = llvm::find_if(
3875 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3876 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3877 });
3878 bool CallerAssignedToCloneOfFunc = false;
3879 if (EI != Clone->CallerEdges.end()) {
3880 const std::shared_ptr<ContextEdge> &Edge = *EI;
3881 PreviousAssignedFuncClone =
3882 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3883 CallerAssignedToCloneOfFunc = true;
3884 }
3885
3886 // Clone function and save it along with the CallInfo map created
3887 // during cloning in the FuncClonesToCallMap.
3888 std::map<CallInfo, CallInfo> NewCallMap;
3889 unsigned CloneNo = FuncClonesToCallMap.size();
3890 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3891 "should already exist in the map");
3892 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3893 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3894 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3895 FunctionClonesAnalysis++;
3896 Changed = true;
3897
3898 // If no caller callsites were already assigned to a clone of this
3899 // function, we can simply assign this clone to the new func clone
3900 // and update all callers to it, then skip to the next clone.
3901 if (!CallerAssignedToCloneOfFunc) {
3902 AssignCallsiteCloneToFuncClone(
3903 NewFuncClone, Call, Clone,
3904 AllocationCallToContextNodeMap.count(Call));
3905 for (auto &CE : Clone->CallerEdges) {
3906 // Ignore any caller that does not have a recorded callsite Call.
3907 if (!CE->Caller->hasCall())
3908 continue;
3909 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3910 }
3911 continue;
3912 }
3913
3914 // We may need to do additional node cloning in this case.
3915 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3916 // that were previously assigned to call PreviousAssignedFuncClone,
3917 // to record that they now call NewFuncClone.
3918 // The none type edge removal may remove some of this Clone's caller
3919 // edges, if it is reached via another of its caller's callees.
3920 // Iterate over a copy and skip any that were removed.
3921 auto CallerEdges = Clone->CallerEdges;
3922 for (auto CE : CallerEdges) {
3923 // Skip any that have been removed on an earlier iteration.
3924 if (CE->isRemoved()) {
3925 assert(!is_contained(Clone->CallerEdges, CE));
3926 continue;
3927 }
3928 assert(CE);
3929 // Ignore any caller that does not have a recorded callsite Call.
3930 if (!CE->Caller->hasCall())
3931 continue;
3932
3933 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3934 // We subsequently fall through to later handling that
3935 // will perform any additional cloning required for
3936 // callers that were calling other function clones.
3937 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3938 PreviousAssignedFuncClone)
3939 continue;
3940
3941 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3942
3943 // If we are cloning a function that was already assigned to some
3944 // callers, then essentially we are creating new callsite clones
3945 // of the other callsites in that function that are reached by those
3946 // callers. Clone the other callees of the current callsite's caller
3947 // that were already assigned to PreviousAssignedFuncClone
3948 // accordingly. This is important since we subsequently update the
3949 // calls from the nodes in the graph and their assignments to callee
3950 // functions recorded in CallsiteToCalleeFuncCloneMap.
3951 // The none type edge removal may remove some of this caller's
3952 // callee edges, if it is reached via another of its callees.
3953 // Iterate over a copy and skip any that were removed.
3954 auto CalleeEdges = CE->Caller->CalleeEdges;
3955 for (auto CalleeEdge : CalleeEdges) {
3956 // Skip any that have been removed on an earlier iteration when
3957 // cleaning up newly None type callee edges.
3958 if (CalleeEdge->isRemoved()) {
3959 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
3960 continue;
3961 }
3962 assert(CalleeEdge);
3963 ContextNode *Callee = CalleeEdge->Callee;
3964 // Skip the current callsite, we are looking for other
3965 // callsites Caller calls, as well as any that does not have a
3966 // recorded callsite Call.
3967 if (Callee == Clone || !Callee->hasCall())
3968 continue;
3969 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3970 removeNoneTypeCalleeEdges(NewClone);
3971 // Moving the edge may have resulted in some none type
3972 // callee edges on the original Callee.
3973 removeNoneTypeCalleeEdges(Callee);
3974 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3975 // If the Callee node was already assigned to call a specific
3976 // function version, make sure its new clone is assigned to call
3977 // that same function clone.
3978 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3979 RecordCalleeFuncOfCallsite(
3980 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3981 // Update NewClone with the new Call clone of this callsite's Call
3982 // created for the new function clone created earlier.
3983 // Recall that we have already ensured when building the graph
3984 // that each caller can only call callsites within the same
3985 // function, so we are guaranteed that Callee Call is in the
3986 // current OrigFunc.
3987 // CallMap is set up as indexed by original Call at clone 0.
3988 CallInfo OrigCall(Callee->getOrigNode()->Call);
3989 OrigCall.setCloneNo(0);
3990 std::map<CallInfo, CallInfo> &CallMap =
3991 FuncClonesToCallMap[NewFuncClone];
3992 assert(CallMap.count(OrigCall));
3993 CallInfo NewCall(CallMap[OrigCall]);
3994 assert(NewCall);
3995 NewClone->setCall(NewCall);
3996 // Need to do the same for all matching calls.
3997 for (auto &MatchingCall : NewClone->MatchingCalls) {
3998 CallInfo OrigMatchingCall(MatchingCall);
3999 OrigMatchingCall.setCloneNo(0);
4000 assert(CallMap.count(OrigMatchingCall));
4001 CallInfo NewCall(CallMap[OrigMatchingCall]);
4002 assert(NewCall);
4003 // Updates the call in the list.
4004 MatchingCall = NewCall;
4005 }
4006 }
4007 }
4008 // Fall through to handling below to perform the recording of the
4009 // function for this callsite clone. This enables handling of cases
4010 // where the callers were assigned to different clones of a function.
4011 }
4012
4013 // See if we can use existing function clone. Walk through
4014 // all caller edges to see if any have already been assigned to
4015 // a clone of this callsite's function. If we can use it, do so. If not,
4016 // because that function clone is already assigned to a different clone
4017 // of this callsite, then we need to clone again.
4018 // Basically, this checking is needed to handle the case where different
4019 // caller functions/callsites may need versions of this function
4020 // containing different mixes of callsite clones across the different
4021 // callsites within the function. If that happens, we need to create
4022 // additional function clones to handle the various combinations.
4023 //
4024 // Keep track of any new clones of this callsite created by the
4025 // following loop, as well as any existing clone that we decided to
4026 // assign this clone to.
4027 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4028 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4029 // Iterate over a copy of Clone's caller edges, since we may need to
4030 // remove edges in the moveEdgeTo* methods, and this simplifies the
4031 // handling and makes it less error-prone.
4032 auto CloneCallerEdges = Clone->CallerEdges;
4033 for (auto &Edge : CloneCallerEdges) {
4034 // Skip removed edges (due to direct recursive edges updated when
4035 // updating callee edges when moving an edge and subsequently
4036 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4037 if (Edge->isRemoved())
4038 continue;
4039 // Ignore any caller that does not have a recorded callsite Call.
4040 if (!Edge->Caller->hasCall())
4041 continue;
4042 // If this caller already assigned to call a version of OrigFunc, need
4043 // to ensure we can assign this callsite clone to that function clone.
4044 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4045 FuncInfo FuncCloneCalledByCaller =
4046 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4047 // First we need to confirm that this function clone is available
4048 // for use by this callsite node clone.
4049 //
4050 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4051 // its callsite clones, one of those callsite clones X could have
4052 // been assigned to the same function clone called by Edge's caller
4053 // - if Edge's caller calls another callsite within Node's original
4054 // function, and that callsite has another caller reaching clone X.
4055 // We need to clone Node again in this case.
4056 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4057 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4058 Clone) ||
4059 // Detect when we have multiple callers of this callsite that
4060 // have already been assigned to specific, and different, clones
4061 // of OrigFunc (due to other unrelated callsites in Func they
4062 // reach via call contexts). Is this Clone of callsite Node
4063 // assigned to a different clone of OrigFunc? If so, clone Node
4064 // again.
4065 (FuncCloneAssignedToCurCallsiteClone &&
4066 FuncCloneAssignedToCurCallsiteClone !=
4067 FuncCloneCalledByCaller)) {
4068 // We need to use a different newly created callsite clone, in
4069 // order to assign it to another new function clone on a
4070 // subsequent iteration over the Clones array (adjusted below).
4071 // Note we specifically do not reset the
4072 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4073 // when this new clone is processed later we know which version of
4074 // the function to copy (so that other callsite clones we have
4075 // assigned to that function clone are properly cloned over). See
4076 // comments in the function cloning handling earlier.
4077
4078 // Check if we already have cloned this callsite again while
4079 // walking through caller edges, for a caller calling the same
4080 // function clone. If so, we can move this edge to that new clone
4081 // rather than creating yet another new clone.
4082 if (FuncCloneToNewCallsiteCloneMap.count(
4083 FuncCloneCalledByCaller)) {
4084 ContextNode *NewClone =
4085 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4086 moveEdgeToExistingCalleeClone(Edge, NewClone);
4087 // Cleanup any none type edges cloned over.
4088 removeNoneTypeCalleeEdges(NewClone);
4089 } else {
4090 // Create a new callsite clone.
4091 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4092 removeNoneTypeCalleeEdges(NewClone);
4093 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4094 NewClone;
4095 // Add to list of clones and process later.
4096 ClonesWorklist.push_back(NewClone);
4097 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4098 }
4099 // Moving the caller edge may have resulted in some none type
4100 // callee edges.
4101 removeNoneTypeCalleeEdges(Clone);
4102 // We will handle the newly created callsite clone in a subsequent
4103 // iteration over this Node's Clones.
4104 continue;
4105 }
4106
4107 // Otherwise, we can use the function clone already assigned to this
4108 // caller.
4109 if (!FuncCloneAssignedToCurCallsiteClone) {
4110 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4111 // Assign Clone to FuncCloneCalledByCaller
4112 AssignCallsiteCloneToFuncClone(
4113 FuncCloneCalledByCaller, Call, Clone,
4114 AllocationCallToContextNodeMap.count(Call));
4115 } else
4116 // Don't need to do anything - callsite is already calling this
4117 // function clone.
4118 assert(FuncCloneAssignedToCurCallsiteClone ==
4119 FuncCloneCalledByCaller);
4120
4121 } else {
4122 // We have not already assigned this caller to a version of
4123 // OrigFunc. Do the assignment now.
4124
4125 // First check if we have already assigned this callsite clone to a
4126 // clone of OrigFunc for another caller during this iteration over
4127 // its caller edges.
4128 if (!FuncCloneAssignedToCurCallsiteClone) {
4129 // Find first function in FuncClonesToCallMap without an assigned
4130 // clone of this callsite Node. We should always have one
4131 // available at this point due to the earlier cloning when the
4132 // FuncClonesToCallMap size was smaller than the clone number.
4133 for (auto &CF : FuncClonesToCallMap) {
4134 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4135 FuncCloneAssignedToCurCallsiteClone = CF.first;
4136 break;
4137 }
4138 }
4139 assert(FuncCloneAssignedToCurCallsiteClone);
4140 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4141 AssignCallsiteCloneToFuncClone(
4142 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4143 AllocationCallToContextNodeMap.count(Call));
4144 } else
4145 assert(FuncCloneToCurNodeCloneMap
4146 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4147 // Update callers to record function version called.
4148 RecordCalleeFuncOfCallsite(Edge->Caller,
4149 FuncCloneAssignedToCurCallsiteClone);
4150 }
4151 }
4152 }
4153 if (VerifyCCG) {
4154 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4155 for (const auto &PE : Node->CalleeEdges)
4156 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4157 for (const auto &CE : Node->CallerEdges)
4158 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4159 for (auto *Clone : Node->Clones) {
4160 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4161 for (const auto &PE : Clone->CalleeEdges)
4162 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4163 for (const auto &CE : Clone->CallerEdges)
4164 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4165 }
4166 }
4167 }
4168 }
4169
4170 uint8_t BothTypes =
4172
4173 auto UpdateCalls = [&](ContextNode *Node,
4175 auto &&UpdateCalls) {
4176 auto Inserted = Visited.insert(Node);
4177 if (!Inserted.second)
4178 return;
4179
4180 for (auto *Clone : Node->Clones)
4181 UpdateCalls(Clone, Visited, UpdateCalls);
4182
4183 for (auto &Edge : Node->CallerEdges)
4184 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4185
4186 // Skip if either no call to update, or if we ended up with no context ids
4187 // (we moved all edges onto other clones).
4188 if (!Node->hasCall() || Node->emptyContextIds())
4189 return;
4190
4191 if (Node->IsAllocation) {
4192 auto AT = allocTypeToUse(Node->AllocTypes);
4193 // If the allocation type is ambiguous, and more aggressive hinting
4194 // has been enabled via the MinClonedColdBytePercent flag, see if this
4195 // allocation should be hinted cold anyway because its fraction cold bytes
4196 // allocated is at least the given threshold.
4197 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4198 !ContextIdToContextSizeInfos.empty()) {
4199 uint64_t TotalCold = 0;
4200 uint64_t Total = 0;
4201 for (auto Id : Node->getContextIds()) {
4202 auto TypeI = ContextIdToAllocationType.find(Id);
4203 assert(TypeI != ContextIdToAllocationType.end());
4204 auto CSI = ContextIdToContextSizeInfos.find(Id);
4205 if (CSI != ContextIdToContextSizeInfos.end()) {
4206 for (auto &Info : CSI->second) {
4207 Total += Info.TotalSize;
4208 if (TypeI->second == AllocationType::Cold)
4209 TotalCold += Info.TotalSize;
4210 }
4211 }
4212 }
4213 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4215 }
4216 updateAllocationCall(Node->Call, AT);
4217 assert(Node->MatchingCalls.empty());
4218 return;
4219 }
4220
4221 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4222 return;
4223
4224 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4225 updateCall(Node->Call, CalleeFunc);
4226 // Update all the matching calls as well.
4227 for (auto &Call : Node->MatchingCalls)
4228 updateCall(Call, CalleeFunc);
4229 };
4230
4231 // Performs DFS traversal starting from allocation nodes to update calls to
4232 // reflect cloning decisions recorded earlier. For regular LTO this will
4233 // update the actual calls in the IR to call the appropriate function clone
4234 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4235 // are recorded in the summary entries.
4237 for (auto &Entry : AllocationCallToContextNodeMap)
4238 UpdateCalls(Entry.second, Visited, UpdateCalls);
4239
4240 return Changed;
4241}
4242
// Create NumClones - 1 new copies of F for the ThinLTO backend; the original
// function serves as "clone" 0, so this helper is only invoked when more than
// one version is needed. Returns the value maps recorded while cloning so the
// caller can later locate the cloned callsite instructions.
// NOTE(review): the signature line and the VMaps declaration fall on lines not
// visible in this extracted listing (orig 4243/4250) — presumably VMaps is the
// returned vector of std::unique_ptr<ValueToValueMapTy>; confirm against the
// checked-in source.
4244 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4246 &FuncToAliasMap) {
4247 // The first "clone" is the original copy, we should only call this if we
4248 // needed to create new clones.
4249 assert(NumClones > 1);
4251 VMaps.reserve(NumClones - 1);
4252 FunctionsClonedThinBackend++;
// Create clone I for each version 1..NumClones-1 (version 0 is F itself, so
// no VMap entry is recorded for it).
4253 for (unsigned I = 1; I < NumClones; I++) {
4254 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4255 auto *NewF = CloneFunction(&F, *VMaps.back());
4256 FunctionClonesThinBackend++;
4257 // Strip memprof and callsite metadata from clone as they are no longer
4258 // needed.
4259 for (auto &BB : *NewF) {
4260 for (auto &Inst : BB) {
4261 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4262 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4263 }
4264 }
// Give the clone its versioned memprof name. If a declaration with that name
// already exists in the module, replace it with this new definition.
4265 std::string Name = getMemProfFuncName(F.getName(), I);
4266 auto *PrevF = M.getFunction(Name);
4267 if (PrevF) {
4268 // We might have created this when adjusting callsite in another
4269 // function. It should be a declaration.
4270 assert(PrevF->isDeclaration());
4271 NewF->takeName(PrevF);
4272 PrevF->replaceAllUsesWith(NewF);
4273 PrevF->eraseFromParent();
4274 } else
4275 NewF->setName(Name);
4276 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4277 << "created clone " << ore::NV("NewFunction", NewF));
4278
4279 // Now handle aliases to this function, and clone those as well.
// Each alias of F gets a matching versioned alias pointing at NewF, mirroring
// the declaration-replacement logic used for the function name above.
4280 if (!FuncToAliasMap.count(&F))
4281 continue;
4282 for (auto *A : FuncToAliasMap[&F]) {
4283 std::string Name = getMemProfFuncName(A->getName(), I);
4284 auto *PrevA = M.getNamedAlias(Name);
4285 auto *NewA = GlobalAlias::create(A->getValueType(),
4286 A->getType()->getPointerAddressSpace(),
4287 A->getLinkage(), Name, NewF);
4288 NewA->copyAttributesFrom(A);
4289 if (PrevA) {
4290 // We might have created this when adjusting callsite in another
4291 // function. It should be a declaration.
4292 assert(PrevA->isDeclaration());
4293 NewA->takeName(PrevA);
4294 PrevA->replaceAllUsesWith(NewA);
4295 PrevA->eraseFromParent();
4296 }
4297 }
4298 }
4299 return VMaps;
4300}
4301
// Locate the index summary (ValueInfo) for F, trying progressively weaker
// GUID queries: current GUID, unadjusted-name GUID (internalized symbols),
// then the GUID of the pre-promotion name reconstructed from source-file
// provenance metadata.
// NOTE(review): several lines of this function fall outside this extracted
// listing (the signature line orig 4304, the expression initializing OrigName
// orig 4320, and the OrigId recomputation orig 4355); the copied code below
// is otherwise unmodified.
4302 // Locate the summary for F. This is complicated by the fact that it might
4303 // have been internalized or promoted.
4305 const ModuleSummaryIndex *ImportSummary,
4306 const Function *CallingFunc = nullptr) {
4307 // FIXME: Ideally we would retain the original GUID in some fashion on the
4308 // function (e.g. as metadata), but for now do our best to locate the
4309 // summary without that information.
4310 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4311 if (!TheFnVI)
4312 // See if the function was internalized, by checking index directly with
4313 // original name (this avoids the name adjustment done by getGUID() for
4314 // internal symbols).
4315 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
4316 if (TheFnVI)
4317 return TheFnVI;
4318 // Now query with the original name before any promotion was performed.
4319 StringRef OrigName =
4321 // When this pass is enabled, we always add thinlto_src_file provenance
4322 // metadata to imported function definitions, which allows us to recreate the
4323 // original internal symbol's GUID.
4324 auto SrcFileMD = F.getMetadata("thinlto_src_file");
4325 // If this is a call to an imported/promoted local for which we didn't import
4326 // the definition, the metadata will not exist on the declaration. However,
4327 // since we are doing this early, before any inlining in the LTO backend, we
4328 // can simply look at the metadata on the calling function which must have
4329 // been from the same module if F was an internal symbol originally.
4330 if (!SrcFileMD && F.isDeclaration()) {
4331 // We would only call this for a declaration for a direct callsite, in which
4332 // case the caller would have provided the calling function pointer.
4333 assert(CallingFunc);
4334 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
4335 // If this is a promoted local (OrigName != F.getName()), since this is a
4336 // declaration, it must be imported from a different module and therefore we
4337 // should always find the metadata on its calling function. Any call to a
4338 // promoted local that came from this module should still be a definition.
4339 assert(SrcFileMD || OrigName == F.getName());
4340 }
// Recreate the GUID the symbol had when it was still internal in its source
// module: getGlobalIdentifier(OrigName, InternalLinkage, SrcFile).
4341 StringRef SrcFile = M.getSourceFileName();
4342 if (SrcFileMD)
4343 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
4344 std::string OrigId = GlobalValue::getGlobalIdentifier(
4345 OrigName, GlobalValue::InternalLinkage, SrcFile);
4346 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4347 // Internal func in original module may have gotten a numbered suffix if we
4348 // imported an external function with the same name. This happens
4349 // automatically during IR linking for naming conflicts. It would have to
4350 // still be internal in that case (otherwise it would have been renamed on
4351 // promotion in which case we wouldn't have a naming conflict).
4352 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
4353 F.getName().contains('.')) {
4354 OrigName = F.getName().rsplit('.').first;
4356 OrigName, GlobalValue::InternalLinkage, SrcFile);
4357 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4358 }
4359 // The only way we may not have a VI is if this is a declaration created for
4360 // an imported reference. For distributed ThinLTO we may not have a VI for
4361 // such declarations in the distributed summary.
4362 assert(TheFnVI || F.isDeclaration());
4363 return TheFnVI;
4364}
4365
4366bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
4367 Module &M) {
4368 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
4369 Symtab = std::make_unique<InstrProfSymtab>();
4370 // Don't add canonical names, to avoid multiple functions to the symtab
4371 // when they both have the same root name with "." suffixes stripped.
4372 // If we pick the wrong one then this could lead to incorrect ICP and calling
4373 // a memprof clone that we don't actually create (resulting in linker unsats).
4374 // What this means is that the GUID of the function (or its PGOFuncName
4375 // metadata) *must* match that in the VP metadata to allow promotion.
4376 // In practice this should not be a limitation, since local functions should
4377 // have PGOFuncName metadata and global function names shouldn't need any
4378 // special handling (they should not get the ".llvm.*" suffix that the
4379 // canonicalization handling is attempting to strip).
4380 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
4381 std::string SymtabFailure = toString(std::move(E));
4382 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
4383 return false;
4384 }
4385 return true;
4386}
4387
4388bool MemProfContextDisambiguation::applyImport(Module &M) {
4389 assert(ImportSummary);
4390 bool Changed = false;
4391
4392 // We also need to clone any aliases that reference cloned functions, because
4393 // the modified callsites may invoke via the alias. Keep track of the aliases
4394 // for each function.
4395 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4396 FuncToAliasMap;
4397 for (auto &A : M.aliases()) {
4398 auto *Aliasee = A.getAliaseeObject();
4399 if (auto *F = dyn_cast<Function>(Aliasee))
4400 FuncToAliasMap[F].insert(&A);
4401 }
4402
4403 if (!initializeIndirectCallPromotionInfo(M))
4404 return false;
4405
4406 for (auto &F : M) {
4407 if (F.isDeclaration() || isMemProfClone(F))
4408 continue;
4409
4411
4413 bool ClonesCreated = false;
4414 unsigned NumClonesCreated = 0;
4415 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
4416 // We should at least have version 0 which is the original copy.
4417 assert(NumClones > 0);
4418 // If only one copy needed use original.
4419 if (NumClones == 1)
4420 return;
4421 // If we already performed cloning of this function, confirm that the
4422 // requested number of clones matches (the thin link should ensure the
4423 // number of clones for each constituent callsite is consistent within
4424 // each function), before returning.
4425 if (ClonesCreated) {
4426 assert(NumClonesCreated == NumClones);
4427 return;
4428 }
4429 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
4430 // The first "clone" is the original copy, which doesn't have a VMap.
4431 assert(VMaps.size() == NumClones - 1);
4432 Changed = true;
4433 ClonesCreated = true;
4434 NumClonesCreated = NumClones;
4435 };
4436
4437 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
4438 Function *CalledFunction) {
4439 // Perform cloning if not yet done.
4440 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
4441
4442 assert(!isMemProfClone(*CalledFunction));
4443
4444 // Update the calls per the summary info.
4445 // Save orig name since it gets updated in the first iteration
4446 // below.
4447 auto CalleeOrigName = CalledFunction->getName();
4448 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
4449 // Do nothing if this version calls the original version of its
4450 // callee.
4451 if (!StackNode.Clones[J])
4452 continue;
4453 auto NewF = M.getOrInsertFunction(
4454 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
4455 CalledFunction->getFunctionType());
4456 CallBase *CBClone;
4457 // Copy 0 is the original function.
4458 if (!J)
4459 CBClone = CB;
4460 else
4461 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4462 CBClone->setCalledFunction(NewF);
4463 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4464 << ore::NV("Call", CBClone) << " in clone "
4465 << ore::NV("Caller", CBClone->getFunction())
4466 << " assigned to call function clone "
4467 << ore::NV("Callee", NewF.getCallee()));
4468 }
4469 };
4470
4471 // Locate the summary for F.
4472 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
4473 // If not found, this could be an imported local (see comment in
4474 // findValueInfoForFunc). Skip for now as it will be cloned in its original
4475 // module (where it would have been promoted to global scope so should
4476 // satisfy any reference in this module).
4477 if (!TheFnVI)
4478 continue;
4479
4480 auto *GVSummary =
4481 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
4482 if (!GVSummary) {
4483 // Must have been imported, use the summary which matches the definition
4484 // (might be multiple if this was a linkonce_odr).
4485 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
4486 assert(SrcModuleMD &&
4487 "enable-import-metadata is needed to emit thinlto_src_module");
4488 StringRef SrcModule =
4489 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
4490 for (auto &GVS : TheFnVI.getSummaryList()) {
4491 if (GVS->modulePath() == SrcModule) {
4492 GVSummary = GVS.get();
4493 break;
4494 }
4495 }
4496 assert(GVSummary && GVSummary->modulePath() == SrcModule);
4497 }
4498
4499 // If this was an imported alias skip it as we won't have the function
4500 // summary, and it should be cloned in the original module.
4501 if (isa<AliasSummary>(GVSummary))
4502 continue;
4503
4504 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
4505
4506 if (FS->allocs().empty() && FS->callsites().empty())
4507 continue;
4508
4509 auto SI = FS->callsites().begin();
4510 auto AI = FS->allocs().begin();
4511
4512 // To handle callsite infos synthesized for tail calls which have missing
4513 // frames in the profiled context, map callee VI to the synthesized callsite
4514 // info.
4515 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
4516 // Iterate the callsites for this function in reverse, since we place all
4517 // those synthesized for tail calls at the end.
4518 for (auto CallsiteIt = FS->callsites().rbegin();
4519 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
4520 auto &Callsite = *CallsiteIt;
4521 // Stop as soon as we see a non-synthesized callsite info (see comment
4522 // above loop). All the entries added for discovered tail calls have empty
4523 // stack ids.
4524 if (!Callsite.StackIdIndices.empty())
4525 break;
4526 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
4527 }
4528
4529 // Keeps track of needed ICP for the function.
4530 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
4531
4532 // Assume for now that the instructions are in the exact same order
4533 // as when the summary was created, but confirm this is correct by
4534 // matching the stack ids.
4535 for (auto &BB : F) {
4536 for (auto &I : BB) {
4537 auto *CB = dyn_cast<CallBase>(&I);
4538 // Same handling as when creating module summary.
4539 if (!mayHaveMemprofSummary(CB))
4540 continue;
4541
4542 auto *CalledValue = CB->getCalledOperand();
4543 auto *CalledFunction = CB->getCalledFunction();
4544 if (CalledValue && !CalledFunction) {
4545 CalledValue = CalledValue->stripPointerCasts();
4546 // Stripping pointer casts can reveal a called function.
4547 CalledFunction = dyn_cast<Function>(CalledValue);
4548 }
4549 // Check if this is an alias to a function. If so, get the
4550 // called aliasee for the checks below.
4551 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
4552 assert(!CalledFunction &&
4553 "Expected null called function in callsite for alias");
4554 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
4555 }
4556
4558 I.getMetadata(LLVMContext::MD_callsite));
4559 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
4560
4561 // Include allocs that were already assigned a memprof function
4562 // attribute in the statistics.
4563 if (CB->getAttributes().hasFnAttr("memprof")) {
4564 assert(!MemProfMD);
4565 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4566 ? AllocTypeColdThinBackend++
4567 : AllocTypeNotColdThinBackend++;
4568 OrigAllocsThinBackend++;
4569 AllocVersionsThinBackend++;
4570 if (!MaxAllocVersionsThinBackend)
4571 MaxAllocVersionsThinBackend = 1;
4572 continue;
4573 }
4574
4575 if (MemProfMD) {
4576 // Consult the next alloc node.
4577 assert(AI != FS->allocs().end());
4578 auto &AllocNode = *(AI++);
4579
4580 // Sanity check that the MIB stack ids match between the summary and
4581 // instruction metadata.
4582 auto MIBIter = AllocNode.MIBs.begin();
4583 for (auto &MDOp : MemProfMD->operands()) {
4584 assert(MIBIter != AllocNode.MIBs.end());
4585 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
4586 MIBIter->StackIdIndices.begin();
4587 auto *MIBMD = cast<const MDNode>(MDOp);
4588 MDNode *StackMDNode = getMIBStackNode(MIBMD);
4589 assert(StackMDNode);
4590 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
4591 auto ContextIterBegin =
4592 StackContext.beginAfterSharedPrefix(CallsiteContext);
4593 // Skip the checking on the first iteration.
4594 uint64_t LastStackContextId =
4595 (ContextIterBegin != StackContext.end() &&
4596 *ContextIterBegin == 0)
4597 ? 1
4598 : 0;
4599 for (auto ContextIter = ContextIterBegin;
4600 ContextIter != StackContext.end(); ++ContextIter) {
4601 // If this is a direct recursion, simply skip the duplicate
4602 // entries, to be consistent with how the summary ids were
4603 // generated during ModuleSummaryAnalysis.
4604 if (LastStackContextId == *ContextIter)
4605 continue;
4606 LastStackContextId = *ContextIter;
4607 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
4608 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4609 *ContextIter);
4610 StackIdIndexIter++;
4611 }
4612 MIBIter++;
4613 }
4614
4615 // Perform cloning if not yet done.
4616 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
4617
4618 OrigAllocsThinBackend++;
4619 AllocVersionsThinBackend += AllocNode.Versions.size();
4620 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
4621 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
4622
4623 // If there is only one version that means we didn't end up
4624 // considering this function for cloning, and in that case the alloc
4625 // will still be none type or should have gotten the default NotCold.
4626 // Skip that after calling clone helper since that does some sanity
4627 // checks that confirm we haven't decided yet that we need cloning.
4628 // We might have a single version that is cold due to the
4629 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
4630 // case.
4631 if (AllocNode.Versions.size() == 1 &&
4632 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
4633 assert((AllocationType)AllocNode.Versions[0] ==
4635 (AllocationType)AllocNode.Versions[0] ==
4637 UnclonableAllocsThinBackend++;
4638 continue;
4639 }
4640
4641 // All versions should have a singular allocation type.
4642 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
4643 return Type == ((uint8_t)AllocationType::NotCold |
4644 (uint8_t)AllocationType::Cold);
4645 }));
4646
4647 // Update the allocation types per the summary info.
4648 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
4649 // Ignore any that didn't get an assigned allocation type.
4650 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
4651 continue;
4652 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
4653 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
4654 : AllocTypeNotColdThinBackend++;
4655 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
4656 auto A = llvm::Attribute::get(F.getContext(), "memprof",
4657 AllocTypeString);
4658 CallBase *CBClone;
4659 // Copy 0 is the original function.
4660 if (!J)
4661 CBClone = CB;
4662 else
4663 // Since VMaps are only created for new clones, we index with
4664 // clone J-1 (J==0 is the original clone and does not have a VMaps
4665 // entry).
4666 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4667 CBClone->addFnAttr(A);
4668 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
4669 << ore::NV("AllocationCall", CBClone) << " in clone "
4670 << ore::NV("Caller", CBClone->getFunction())
4671 << " marked with memprof allocation attribute "
4672 << ore::NV("Attribute", AllocTypeString));
4673 }
4674 } else if (!CallsiteContext.empty()) {
4675 if (!CalledFunction) {
4676#ifndef NDEBUG
4677 // We should have skipped inline assembly calls.
4678 auto *CI = dyn_cast<CallInst>(CB);
4679 assert(!CI || !CI->isInlineAsm());
4680#endif
4681 // We should have skipped direct calls via a Constant.
4682 assert(CalledValue && !isa<Constant>(CalledValue));
4683
4684 // This is an indirect call, see if we have profile information and
4685 // whether any clones were recorded for the profiled targets (that
4686 // we synthesized CallsiteInfo summary records for when building the
4687 // index).
4688 auto NumClones =
4689 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
4690
4691 // Perform cloning if not yet done. This is done here in case
4692 // we don't need to do ICP, but might need to clone this
4693 // function as it is the target of other cloned calls.
4694 if (NumClones)
4695 CloneFuncIfNeeded(NumClones);
4696 }
4697
4698 else {
4699 // Consult the next callsite node.
4700 assert(SI != FS->callsites().end());
4701 auto &StackNode = *(SI++);
4702
4703#ifndef NDEBUG
4704 // Sanity check that the stack ids match between the summary and
4705 // instruction metadata.
4706 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
4707 for (auto StackId : CallsiteContext) {
4708 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
4709 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4710 StackId);
4711 StackIdIndexIter++;
4712 }
4713#endif
4714
4715 CloneCallsite(StackNode, CB, CalledFunction);
4716 }
4717 } else if (CB->isTailCall() && CalledFunction) {
4718 // Locate the synthesized callsite info for the callee VI, if any was
4719 // created, and use that for cloning.
4720 ValueInfo CalleeVI =
4721 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
4722 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
4723 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
4724 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
4725 CloneCallsite(Callsite->second, CB, CalledFunction);
4726 }
4727 }
4728 }
4729 }
4730
4731 // Now do any promotion required for cloning.
4732 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
4733 }
4734
4735 // We skip some of the functions and instructions above, so remove all the
4736 // metadata in a single sweep here.
4737 for (auto &F : M) {
4738 // We can skip memprof clones because createFunctionClones already strips
4739 // the metadata from the newly created clones.
4740 if (F.isDeclaration() || isMemProfClone(F))
4741 continue;
4742 for (auto &BB : F) {
4743 for (auto &I : BB) {
4744 if (!isa<CallBase>(I))
4745 continue;
4746 I.setMetadata(LLVMContext::MD_memprof, nullptr);
4747 I.setMetadata(LLVMContext::MD_callsite, nullptr);
4748 }
4749 }
4750 }
4751
4752 return Changed;
4753}
4754
// Records indirect-call-promotion (ICP) candidate information for the
// indirect call CB: pairs each profiled target reported by ICallAnalysis
// with the CallsiteInfo summary record synthesized for it when the index
// was built, and returns the number of clones recorded for this function's
// callsites (0 if there is no profile data for CB). An ICallAnalysisData
// entry is appended to ICallAnalysisInfo only when some candidate's
// callsite clone should call a cloned target (i.e. ICP will be needed).
// NOTE(review): this rendering elides original line 4757, which declared
// the callsite-summary iterator parameter SI used below — confirm against
// the repository source before editing.
4755unsigned MemProfContextDisambiguation::recordICPInfo(
4756 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
4758 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
4759 // First see if we have profile information for this indirect call.
4760 uint32_t NumCandidates;
4761 uint64_t TotalCount;
4762 auto CandidateProfileData =
4763 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
4764 NumCandidates);
4765 if (CandidateProfileData.empty())
4766 return 0;
4767
4768 // Iterate through all of the candidate profiled targets along with the
4769 // CallsiteInfo summary records synthesized for them when building the index,
4770 // and see if any are cloned and/or refer to clones.
4771 bool ICPNeeded = false;
4772 unsigned NumClones = 0;
// Remember where this callsite's summary records begin, so performICP can
// re-locate the same CallsiteInfo range later by index.
4773 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
4774 for (const auto &Candidate : CandidateProfileData) {
4775#ifndef NDEBUG
4776 auto CalleeValueInfo =
4777#endif
// The getValueInfo call itself runs in all build modes; only the captured
// result (used by the assert below) is debug-only.
4778 ImportSummary->getValueInfo(Candidate.Value);
4779 // We might not have a ValueInfo if this is a distributed
4780 // ThinLTO backend and decided not to import that function.
4781 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
4782 assert(SI != AllCallsites.end());
4783 auto &StackNode = *(SI++);
4784 // See if any of the clones of the indirect callsite for this
4785 // profiled target should call a cloned version of the profiled
4786 // target. We only need to do the ICP here if so.
4787 ICPNeeded |= llvm::any_of(StackNode.Clones,
4788 [](unsigned CloneNo) { return CloneNo != 0; });
4789 // Every callsite in the same function should have been cloned the same
4790 // number of times.
4791 assert(!NumClones || NumClones == StackNode.Clones.size());
4792 NumClones = StackNode.Clones.size();
4793 }
4794 if (!ICPNeeded)
4795 return NumClones;
4796 // Save information for ICP, which is performed later to avoid messing up the
4797 // current function traversal.
4798 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
4799 TotalCount, CallsiteInfoStartIndex});
4800 return NumClones;
4801}
4802
// Performs the deferred indirect call promotion recorded by recordICPInfo:
// for each saved candidate, speculatively devirtualizes every clone of the
// indirect callsite to the profiled target (comparing against the original
// target, which is what the vtable holds), then redirects the resulting
// direct call to the appropriate memprof function clone. Finally rewrites
// the MD_prof value-profile metadata on each callsite clone to drop the
// promoted records.
// NOTE(review): this rendering elides original lines 4807 (the
// OptimizationRemarkEmitter &ORE parameter), 4838 (part of the
// promote-declaration condition, presumably the
// MemProfRequireDefinitionForPromotion option check), and 4915 (the
// annotateValueSite(...) call guarded by the TotalCount check) — confirm
// against the repository source before editing.
4803void MemProfContextDisambiguation::performICP(
4804 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
4805 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
4806 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
4808 // Now do any promotion required for cloning. Specifically, for each
4809 // recorded ICP candidate (which was only recorded because one clone of that
4810 // candidate should call a cloned target), we perform ICP (speculative
4811 // devirtualization) for each clone of the callsite, and update its callee
4812 // to the appropriate clone. Note that the ICP compares against the original
4813 // version of the target, which is what is in the vtable.
4814 for (auto &Info : ICallAnalysisInfo) {
4815 auto *CB = Info.CB;
// Index back into AllCallsites where this callsite's summary records start.
4816 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
4817 auto TotalCount = Info.TotalCount;
4818 unsigned NumPromoted = 0;
4819 unsigned NumClones = 0;
4820
4821 for (auto &Candidate : Info.CandidateProfileData) {
4822 auto &StackNode = AllCallsites[CallsiteIndex++];
4823
4824 // All calls in the same function must have the same number of clones.
4825 assert(!NumClones || NumClones == StackNode.Clones.size());
4826 NumClones = StackNode.Clones.size();
4827
4828 // See if the target is in the module. If it wasn't imported, it is
4829 // possible that this profile could have been collected on a different
4830 // target (or version of the code), and we need to be conservative
4831 // (similar to what is done in the ICP pass).
4832 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4833 if (TargetFunction == nullptr ||
4834 // Any ThinLTO global dead symbol removal should have already
4835 // occurred, so it should be safe to promote when the target is a
4836 // declaration.
4837 // TODO: Remove internal option once more fully tested.
4839 TargetFunction->isDeclaration())) {
4840 ORE.emit([&]() {
4841 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
4842 << "Memprof cannot promote indirect call: target with md5sum "
4843 << ore::NV("target md5sum", Candidate.Value) << " not found";
4844 });
4845 // FIXME: See if we can use the new declaration importing support to
4846 // at least get the declarations imported for this case. Hot indirect
4847 // targets should have been imported normally, however.
4848 continue;
4849 }
4850
4851 // Check if legal to promote
4852 const char *Reason = nullptr;
4853 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
4854 ORE.emit([&]() {
4855 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
4856 << "Memprof cannot promote indirect call to "
4857 << ore::NV("TargetFunction", TargetFunction)
4858 << " with count of " << ore::NV("TotalCount", TotalCount)
4859 << ": " << Reason;
4860 });
4861 continue;
4862 }
4863
// We always promote to the original target; clones are substituted below.
4864 assert(!isMemProfClone(*TargetFunction));
4865
4866 // Handle each call clone, applying ICP so that each clone directly
4867 // calls the specified callee clone, guarded by the appropriate ICP
4868 // check.
4869 CallBase *CBClone = CB;
4870 for (unsigned J = 0; J < NumClones; J++) {
4871 // Copy 0 is the original function.
4872 if (J > 0)
// VMaps are indexed by J-1 since only new clones (J>=1) have an entry.
4873 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4874 // We do the promotion using the original name, so that the comparison
4875 // is against the name in the vtable. Then just below, change the new
4876 // direct call to call the cloned function.
4877 auto &DirectCall =
4878 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
4879 TotalCount, isSamplePGO, &ORE);
4880 auto *TargetToUse = TargetFunction;
4881 // Call original if this version calls the original version of its
4882 // callee.
4883 if (StackNode.Clones[J]) {
// Insert (or find) the callee clone by its memprof-suffixed name so the
// direct call targets the correct allocation-type version.
4884 TargetToUse =
4885 cast<Function>(M.getOrInsertFunction(
4886 getMemProfFuncName(TargetFunction->getName(),
4887 StackNode.Clones[J]),
4888 TargetFunction->getFunctionType())
4889 .getCallee());
4890 }
4891 DirectCall.setCalledFunction(TargetToUse);
4892 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4893 << ore::NV("Call", CBClone) << " in clone "
4894 << ore::NV("Caller", CBClone->getFunction())
4895 << " promoted and assigned to call function clone "
4896 << ore::NV("Callee", TargetToUse));
4897 }
4898
4899 // Update TotalCount (all clones should get same count above)
4900 TotalCount -= Candidate.Count;
4901 NumPromoted++;
4902 }
4903 // Adjust the MD.prof metadata for all clones, now that we have the new
4904 // TotalCount and the number promoted.
4905 CallBase *CBClone = CB;
4906 for (unsigned J = 0; J < NumClones; J++) {
4907 // Copy 0 is the original function.
4908 if (J > 0)
4909 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4910 // First delete the old one.
4911 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
4912 // If all promoted, we don't need the MD.prof metadata.
4913 // Otherwise we need update with the un-promoted records back.
4914 if (TotalCount != 0)
4916 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
4917 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
4918 }
4919 }
4920}
4921
// Top-level driver for the calling-context graph (CCG): optionally dumps,
// exports, and verifies the graph between each stage, then identifies the
// context clones needed to disambiguate allocation types, assigns function
// clones accordingly, and returns whether any change was made (the result
// of assignFunctions()).
// NOTE(review): this rendering elides original line 4957, which appears to
// be the guard (presumably `if (MemProfReportHintedSizes)`) for the
// printTotalSizes(errs()) call below — confirm against the repository
// source before editing.
4922template <typename DerivedCCG, typename FuncTy, typename CallTy>
4923bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
4924 if (DumpCCG) {
4925 dbgs() << "CCG before cloning:\n";
4926 dbgs() << *this;
4927 }
4928 if (ExportToDot)
4929 exportToDot("postbuild");
4930
// Optional consistency verification before any cloning is done.
4931 if (VerifyCCG) {
4932 check();
4933 }
4934
4935 identifyClones();
4936
// Re-verify after cloning to catch graph corruption introduced above.
4937 if (VerifyCCG) {
4938 check();
4939 }
4940
4941 if (DumpCCG) {
4942 dbgs() << "CCG after cloning:\n";
4943 dbgs() << *this;
4944 }
4945 if (ExportToDot)
4946 exportToDot("cloned");
4947
4948 bool Changed = assignFunctions();
4949
4950 if (DumpCCG) {
4951 dbgs() << "CCG after assigning function clones:\n";
4952 dbgs() << *this;
4953 }
4954 if (ExportToDot)
4955 exportToDot("clonefuncassign");
4956
4958 printTotalSizes(errs());
4959
4960 return Changed;
4961}
4962
// Entry point for regular-LTO / per-module processing. If an import summary
// is present (distributed ThinLTO backend), the cloning decisions were
// already made during the thin link, so we just apply them via applyImport.
// Otherwise a module-level callsite context graph is built and processed.
// Returns true if the module was changed.
// NOTE(review): this rendering elides original line 4965, which declared
// the OREGetter parameter used when constructing the CCG below — confirm
// against the repository source before editing.
4963bool MemProfContextDisambiguation::processModule(
4964 Module &M,
4966
4967 // If we have an import summary, then the cloning decisions were made during
4968 // the thin link on the index. Apply them and return.
4969 if (ImportSummary)
4970 return applyImport(M);
4971
4972 // TODO: If/when other types of memprof cloning are enabled beyond just for
4973 // hot and cold, we will need to change this to individually control the
4974 // AllocationType passed to addStackNodesForMIB during CCG construction.
4975 // Note that we specifically check this after applying imports above, so that
4976 // the option isn't needed to be passed to distributed ThinLTO backend
4977 // clang processes, which won't necessarily have visibility into the linker
4978 // dependences. Instead the information is communicated from the LTO link to
4979 // the backends via the combined summary index.
4980 if (!SupportsHotColdNew)
4981 return false;
4982
4983 ModuleCallsiteContextGraph CCG(M, OREGetter);
4984 return CCG.process();
4985}
4986
// Constructor (the member-initializer list on line 4989 establishes this is
// MemProfContextDisambiguation's ctor; the name line, original 4987, is
// elided in this rendering). Stores the optional import summary and
// sample-PGO flag; when no summary was supplied by the pass pipeline, it
// optionally loads one from the -memprof-import-summary file for testing
// the distributed ThinLTO backend via opt.
// NOTE(review): original lines 4994 (an assert, presumably that
// MemProfImportSummary is empty when a summary was supplied) and 5001 (the
// MemoryBuffer::getFile(MemProfImportSummary) call initializing
// ReadSummaryFile) are also elided — confirm against the repository source.
4988 const ModuleSummaryIndex *Summary, bool isSamplePGO)
4989 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
4990 if (ImportSummary) {
4991 // The MemProfImportSummary should only be used for testing ThinLTO
4992 // distributed backend handling via opt, in which case we don't have a
4993 // summary from the pass pipeline.
4995 return;
4996 }
// Nothing to do unless the testing override option was given.
4997 if (MemProfImportSummary.empty())
4998 return;
4999
5000 auto ReadSummaryFile =
5002 if (!ReadSummaryFile) {
// Report the error but continue without a summary rather than aborting.
5003 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
5004 "Error loading file '" + MemProfImportSummary +
5005 "': ");
5006 return;
5007 }
5008 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
5009 if (!ImportSummaryForTestingOrErr) {
5010 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
5011 "Error parsing file '" + MemProfImportSummary +
5012 "': ");
5013 return;
5014 }
// Keep ownership in the member unique_ptr; ImportSummary is a raw view.
5015 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
5016 ImportSummary = ImportSummaryForTesting.get();
5017}
5018
// New-pass-manager entry point (the signature, original lines 5019-5020,
// is elided in this rendering — per the run(Module&, ModuleAnalysisManager&)
// declaration in the doxygen index below, this is
// MemProfContextDisambiguation::run). Builds a per-function ORE getter and
// delegates to processModule; preserves all analyses iff nothing changed.
// NOTE(review): original line 5023 (the lambda body returning
// FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F) or similar) is also
// elided — confirm against the repository source.
5021 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
5022 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
5024 };
5025 if (!processModule(M, OREGetter))
5026 return PreservedAnalyses::all();
5027 return PreservedAnalyses::none();
5028}
5029
// ThinLTO-index entry point (the signature, original lines 5030-5032, is
// elided in this rendering; the visible `isPrevailing)` parameter and the
// IndexCallsiteContextGraph construction show this is the overload taking
// the combined ModuleSummaryIndex and a prevailing-symbol callback).
// Performs whole-program context disambiguation on the index during the
// thin link.
5033 isPrevailing) {
5034 // TODO: If/when other types of memprof cloning are enabled beyond just for
5035 // hot and cold, we will need to change this to individually control the
5036 // AllocationType passed to addStackNodesForMIB during CCG construction.
5037 // The index was set from the option, so these should be in sync.
5038 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
5039 if (!SupportsHotColdNew)
5040 return;
5041
5042 IndexCallsiteContextGraph CCG(Index, isPrevailing);
// Result is ignored here: cloning decisions are recorded in the index
// itself and applied later in the backends via applyImport.
5043 CCG.process();
5044}
aarch64 promote const
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
#define DEBUG_TYPE
global merge func
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
This file implements a map that provides insertion order iteration.
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap)
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
static bool isMemProfClone(const Function &F)
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
static cl::opt< bool > AllowRecursiveCallsites("memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles"))
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary, const Function *CallingFunc=nullptr)
cl::opt< unsigned > MinClonedColdBytePercent
static std::string getAllocTypeString(uint8_t AllocTypes)
cl::opt< bool > MemProfReportHintedSizes
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
#define DEBUG_TYPE
static const std::string MemProfCloneSuffix
static cl::opt< bool > AllowRecursiveContexts("memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts through recursive cycles"))
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
#define P(N)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
void print(OutputBuffer &OB) const
DomTreeNode::const_iterator end() const
Definition: LoopUnroll.cpp:249
Alias summary information.
ValueInfo getAliaseeVI() const
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
Definition: InstrTypes.h:1474
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1380
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
static GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition: Globals.cpp:576
Function and variable summary information to aid decisions and implementation of importing.
static bool isLocalLinkage(LinkageTypes Linkage)
Definition: GlobalValue.h:410
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:315
GUID getGUID() const
Return a 64-bit global unique ID constructed from global value name (i.e.
Definition: GlobalValue.h:596
std::string getGlobalIdentifier() const
Return the modified name for this global value suitable to be used as the key for a global lookup (e....
Definition: Globals.cpp:185
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:567
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
GlobalValueSummary * findSummaryInModule(ValueInfo VI, StringRef ModuleId) const
Find the summary for ValueInfo VI in module ModuleId, or nullptr if not found.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
unsigned size() const
bool insert(SUnit *SU)
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition: DenseSet.h:90
size_type size() const
Definition: DenseSet.h:81
void swap(DenseSetImpl &RHS)
Definition: DenseSet.h:99
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator end() const
CallStackIterator beginAfterSharedPrefix(CallStack &Other)
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
@ FS
Definition: X86.h:211
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition: Error.cpp:65
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B (every element of A is also in B)
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
Definition: GraphWriter.h:359
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr)
Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1301
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
Definition: SetOperations.h:43
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
Definition: SetOperations.h:83
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
S1Ty set_difference(const S1Ty &S1, const S2Ty &S2)
set_difference(A, B) - Return A - B
Definition: SetOperations.h:93
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:303
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition: Error.h:1231
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
const char * toString(DWARFSectionKind Kind)
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summa...
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
Summary of memprof metadata on allocations.
Summary of memprof callsite metadata.
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Used in the streaming interface as the general argument type.
typename GraphType::UnknownGraphTypeError NodeRef
Definition: GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const
static SimpleType getSimplifiedValue(IndexCall &Val)
static SimpleType getSimplifiedValue(const IndexCall &Val)
Define a template that can be specialized by smart pointers to reflect the fact that they are automat...
Definition: Casting.h:34