LLVM 21.0.0git
OMPIRBuilder.h
Go to the documentation of this file.
1//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the OpenMPIRBuilder class and helpers used as a convenient
10// way to create LLVM instructions for OpenMP directives.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16
21#include "llvm/IR/DebugLoc.h"
22#include "llvm/IR/IRBuilder.h"
23#include "llvm/IR/Module.h"
26#include <forward_list>
27#include <map>
28#include <optional>
29
30namespace llvm {
31class CanonicalLoopInfo;
32struct TargetRegionEntryInfo;
33class OffloadEntriesInfoManager;
34class OpenMPIRBuilder;
35
36/// Move the instruction after an InsertPoint to the beginning of another
37/// BasicBlock.
38///
39/// The instructions after \p IP are moved to the beginning of \p New which must
40/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
41/// \p New will be added such that there is no semantic change. Otherwise, the
42/// \p IP insert block remains degenerate and it is up to the caller to insert a
43/// terminator. \p DL is used as the debug location for the branch instruction
44/// if one is created.
45void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch,
46 DebugLoc DL);
47
48/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
49/// insert location will stick to after the instruction before the insertion
50/// point (instead of moving with the instruction the InsertPoint stores
51/// internally).
52void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);
53
54/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
55/// (missing the terminator).
56///
57/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
58/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
59/// is true, a branch to the new successor will new created such that
60/// semantically there is no change; otherwise the block of the insertion point
61/// remains degenerate and it is the caller's responsibility to insert a
62/// terminator. \p DL is used as the debug location for the branch instruction
63/// if one is created. Returns the new successor block.
64BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
65 DebugLoc DL, llvm::Twine Name = {});
66
67/// Split a BasicBlock at \p Builder's insertion point, even if the block is
68/// degenerate (missing the terminator). Its new insert location will stick to
69/// after the instruction before the insertion point (instead of moving with the
70/// instruction the InsertPoint stores internally).
71BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
72 llvm::Twine Name = {});
73
74/// Split a BasicBlock at \p Builder's insertion point, even if the block is
75/// degenerate (missing the terminator). Its new insert location will stick to
76/// after the instruction before the insertion point (instead of moving with the
77/// instruction the InsertPoint stores internally).
78BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);
79
80/// Like splitBB, but reuses the current block's name for the new name.
81BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
82 llvm::Twine Suffix = ".split");
83
84/// Captures attributes that affect generating LLVM-IR using the
85/// OpenMPIRBuilder and related classes. Note that not all attributes are
86/// required for all classes or functions. In some use cases the configuration
87/// is not necessary at all, because because the only functions that are called
88/// are ones that are not dependent on the configuration.
90public:
91 /// Flag to define whether to generate code for the role of the OpenMP host
92 /// (if set to false) or device (if set to true) in an offloading context. It
93 /// is set when the -fopenmp-is-target-device compiler frontend option is
94 /// specified.
95 std::optional<bool> IsTargetDevice;
96
97 /// Flag for specifying if the compilation is done for an accelerator. It is
98 /// set according to the architecture of the target triple and currently only
99 /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
100 /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
101 /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
102 /// like target with the ability to work as the OpenMP host is added, or if
103 /// the capabilities of the currently supported GPU architectures are
104 /// expanded.
105 std::optional<bool> IsGPU;
106
107 /// Flag for specifying if LLVMUsed information should be emitted.
108 std::optional<bool> EmitLLVMUsedMetaInfo;
109
110 /// Flag for specifying if offloading is mandatory.
111 std::optional<bool> OpenMPOffloadMandatory;
112
113 /// First separator used between the initial two parts of a name.
114 std::optional<StringRef> FirstSeparator;
115 /// Separator used between all of the rest consecutive parts of s name
116 std::optional<StringRef> Separator;
117
118 // Grid Value for the GPU target
119 std::optional<omp::GV> GridValue;
120
121 /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
122 /// false`), this contains the list of offloading triples associated, if any.
124
128 bool HasRequiresReverseOffload,
129 bool HasRequiresUnifiedAddress,
130 bool HasRequiresUnifiedSharedMemory,
131 bool HasRequiresDynamicAllocators);
132
133 // Getters functions that assert if the required values are not present.
134 bool isTargetDevice() const {
135 assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
136 return *IsTargetDevice;
137 }
138
139 bool isGPU() const {
140 assert(IsGPU.has_value() && "IsGPU is not set");
141 return *IsGPU;
142 }
143
145 assert(OpenMPOffloadMandatory.has_value() &&
146 "OpenMPOffloadMandatory is not set");
148 }
149
151 assert(GridValue.has_value() && "GridValue is not set");
152 return *GridValue;
153 }
154
155 bool hasRequiresFlags() const { return RequiresFlags; }
156 bool hasRequiresReverseOffload() const;
157 bool hasRequiresUnifiedAddress() const;
159 bool hasRequiresDynamicAllocators() const;
160
161 /// Returns requires directive clauses as flags compatible with those expected
162 /// by libomptarget.
163 int64_t getRequiresFlags() const;
164
165 // Returns the FirstSeparator if set, otherwise use the default separator
166 // depending on isGPU
168 if (FirstSeparator.has_value())
169 return *FirstSeparator;
170 if (isGPU())
171 return "_";
172 return ".";
173 }
174
175 // Returns the Separator if set, otherwise use the default separator depending
176 // on isGPU
178 if (Separator.has_value())
179 return *Separator;
180 if (isGPU())
181 return "$";
182 return ".";
183 }
184
186 void setIsGPU(bool Value) { IsGPU = Value; }
192
197
198private:
199 /// Flags for specifying which requires directive clauses are present.
200 int64_t RequiresFlags;
201};
202
203/// Data structure to contain the information needed to uniquely identify
204/// a target entry.
206 /// The prefix used for kernel names.
207 static constexpr const char *KernelNamePrefix = "__omp_offloading_";
208
209 std::string ParentName;
210 unsigned DeviceID;
211 unsigned FileID;
212 unsigned Line;
213 unsigned Count;
214
217 unsigned FileID, unsigned Line, unsigned Count = 0)
219 Count(Count) {}
220
223 unsigned DeviceID, unsigned FileID,
224 unsigned Line, unsigned Count);
225
227 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
228 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
229 RHS.Count);
230 }
231};
232
233/// Class that manages information about offload code regions and data
235 /// Number of entries registered so far.
236 OpenMPIRBuilder *OMPBuilder;
237 unsigned OffloadingEntriesNum = 0;
238
239public:
240 /// Base class of the entries info.
242 public:
243 /// Kind of a given entry.
244 enum OffloadingEntryInfoKinds : unsigned {
245 /// Entry is a target region.
247 /// Entry is a declare target variable.
249 /// Invalid entry info.
251 };
252
253 protected:
255 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
256 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
257 uint32_t Flags)
258 : Flags(Flags), Order(Order), Kind(Kind) {}
259 ~OffloadEntryInfo() = default;
260
261 public:
262 bool isValid() const { return Order != ~0u; }
263 unsigned getOrder() const { return Order; }
264 OffloadingEntryInfoKinds getKind() const { return Kind; }
265 uint32_t getFlags() const { return Flags; }
266 void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
267 Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
269 assert(!Addr.pointsToAliveValue() && "Address has been set before!");
270 Addr = V;
271 }
272 static bool classof(const OffloadEntryInfo *Info) { return true; }
273
274 private:
275 /// Address of the entity that has to be mapped for offloading.
276 WeakTrackingVH Addr;
277
278 /// Flags associated with the device global.
279 uint32_t Flags = 0u;
280
281 /// Order this entry was emitted.
282 unsigned Order = ~0u;
283
285 };
286
287 /// Return true if a there are no entries defined.
288 bool empty() const;
289 /// Return number of entries defined so far.
290 unsigned size() const { return OffloadingEntriesNum; }
291
292 OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
293
294 //
295 // Target region entries related.
296 //
297
298 /// Kind of the target registry entry.
300 /// Mark the entry as target region.
302 };
303
304 /// Target region entries info.
306 /// Address that can be used as the ID of the entry.
307 Constant *ID = nullptr;
308
309 public:
312 explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
313 Constant *ID,
316 ID(ID) {
318 }
319
320 Constant *getID() const { return ID; }
321 void setID(Constant *V) {
322 assert(!ID && "ID has been set before!");
323 ID = V;
324 }
325 static bool classof(const OffloadEntryInfo *Info) {
326 return Info->getKind() == OffloadingEntryInfoTargetRegion;
327 }
328 };
329
330 /// Initialize target region entry.
331 /// This is ONLY needed for DEVICE compilation.
333 unsigned Order);
334 /// Register target region entry.
338 /// Return true if a target region entry with the provided information
339 /// exists.
341 bool IgnoreAddressId = false) const;
342
343 // Return the Name based on \a EntryInfo using the next available Count.
345 const TargetRegionEntryInfo &EntryInfo);
346
347 /// brief Applies action \a Action on all registered entries.
348 typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
349 const OffloadEntryInfoTargetRegion &)>
351 void
353
354 //
355 // Device global variable entries related.
356 //
357
358 /// Kind of the global variable entry..
360 /// Mark the entry as a to declare target.
362 /// Mark the entry as a to declare target link.
364 /// Mark the entry as a declare target enter.
366 /// Mark the entry as having no declare target entry kind.
368 /// Mark the entry as a declare target indirect global.
370 /// Mark the entry as a register requires global.
372 };
373
374 /// Kind of device clause for declare target variables
375 /// and functions
376 /// NOTE: Currently not used as a part of a variable entry
377 /// used for Flang and Clang to interface with the variable
378 /// related registration functions
380 /// The target is marked for all devices
382 /// The target is marked for non-host devices
384 /// The target is marked for host devices
386 /// The target is marked as having no clause
388 };
389
390 /// Device global variable entries info.
392 /// Type of the global variable.
393 int64_t VarSize;
395 const std::string VarName;
396
397 public:
400 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
403 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
404 int64_t VarSize,
407 const std::string &VarName)
409 VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
411 }
412
413 int64_t getVarSize() const { return VarSize; }
414 StringRef getVarName() const { return VarName; }
415 void setVarSize(int64_t Size) { VarSize = Size; }
416 GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
417 void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
418 static bool classof(const OffloadEntryInfo *Info) {
419 return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
420 }
421 };
422
423 /// Initialize device global variable entry.
424 /// This is ONLY used for DEVICE compilation.
427 unsigned Order);
428
429 /// Register device global variable entry.
431 int64_t VarSize,
434 /// Checks if the variable with the given name has been registered already.
436 return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
437 }
438 /// Applies action \a Action on all registered entries.
439 typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
443
444private:
445 /// Return the count of entries at a particular source location.
446 unsigned
447 getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;
448
449 /// Update the count of entries at a particular source location.
450 void
451 incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);
452
454 getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
455 return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
456 EntryInfo.FileID, EntryInfo.Line, 0);
457 }
458
459 // Count of entries at a location.
460 std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;
461
462 // Storage for target region entries kind.
463 typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
464 OffloadEntriesTargetRegionTy;
465 OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
466 /// Storage for device global variable entries kind. The storage is to be
467 /// indexed by mangled name.
469 OffloadEntriesDeviceGlobalVarTy;
470 OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
471};
472
473/// An interface to create LLVM-IR for OpenMP directives.
474///
475/// Each OpenMP directive has a corresponding public generator method.
477public:
478 /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
479 /// not have an effect on \p M (see initialize)
481 : M(M), Builder(M.getContext()), OffloadInfoManager(this),
482 T(Triple(M.getTargetTriple())) {}
484
486 llvm::Value *AtomicVar;
487
488 public:
494 AtomicVar(AtomicVar) {}
495
496 llvm::Value *getAtomicPointer() const override { return AtomicVar; }
499 const llvm::Twine &Name) const override {
500 llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty);
501 allocaInst->setName(Name);
502 return allocaInst;
503 }
504 };
505 /// Initialize the internal state, this will put structures types and
506 /// potentially other helpers into the underlying module. Must be called
507 /// before any other method and only once! This internal state includes types
508 /// used in the OpenMPIRBuilder generated from OMPKinds.def.
509 void initialize();
510
512
513 /// Finalize the underlying module, e.g., by outlining regions.
514 /// \param Fn The function to be finalized. If not used,
515 /// all functions are finalized.
516 void finalize(Function *Fn = nullptr);
517
518 /// Add attributes known for \p FnID to \p Fn.
520
521 /// Type used throughout for insertion points.
523
524 /// Type used to represent an insertion point or an error value.
526
527 /// Get the create a name using the platform specific separators.
528 /// \param Parts parts of the final name that needs separation
529 /// The created name has a first separator between the first and second part
530 /// and a second separator between all other parts.
531 /// E.g. with FirstSeparator "$" and Separator "." and
532 /// parts: "p1", "p2", "p3", "p4"
533 /// The resulting name is "p1$p2.p3.p4"
534 /// The separators are retrieved from the OpenMPIRBuilderConfig.
535 std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
536
537 /// Callback type for variable finalization (think destructors).
538 ///
539 /// \param CodeGenIP is the insertion point at which the finalization code
540 /// should be placed.
541 ///
542 /// A finalize callback knows about all objects that need finalization, e.g.
543 /// destruction, when the scope of the currently generated construct is left
544 /// at the time, and location, the callback is invoked.
545 using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>;
546
548 /// The finalization callback provided by the last in-flight invocation of
549 /// createXXXX for the directive of kind DK.
551
552 /// The directive kind of the innermost directive that has an associated
553 /// region which might require finalization when it is left.
554 omp::Directive DK;
555
556 /// Flag to indicate if the directive is cancellable.
558 };
559
560 /// Push a finalization callback on the finalization stack.
561 ///
562 /// NOTE: Temporary solution until Clang CG is gone.
564 FinalizationStack.push_back(FI);
565 }
566
567 /// Pop the last finalization callback from the finalization stack.
568 ///
569 /// NOTE: Temporary solution until Clang CG is gone.
571
572 /// Callback type for body (=inner region) code generation
573 ///
574 /// The callback takes code locations as arguments, each describing a
575 /// location where additional instructions can be inserted.
576 ///
577 /// The CodeGenIP may be in the middle of a basic block or point to the end of
578 /// it. The basic block may have a terminator or be degenerate. The callback
579 /// function may just insert instructions at that position, but also split the
580 /// block (without the Before argument of BasicBlock::splitBasicBlock such
581 /// that the identify of the split predecessor block is preserved) and insert
582 /// additional control flow, including branches that do not lead back to what
583 /// follows the CodeGenIP. Note that since the callback is allowed to split
584 /// the block, callers must assume that InsertPoints to positions in the
585 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
586 /// such InsertPoints need to be preserved, it can split the block itself
587 /// before calling the callback.
588 ///
589 /// AllocaIP and CodeGenIP must not point to the same position.
590 ///
591 /// \param AllocaIP is the insertion point at which new alloca instructions
592 /// should be placed. The BasicBlock it is pointing to must
593 /// not be split.
594 /// \param CodeGenIP is the insertion point at which the body code should be
595 /// placed.
596 ///
597 /// \return an error, if any were triggered during execution.
599 function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
600
601 // This is created primarily for sections construct as llvm::function_ref
602 // (BodyGenCallbackTy) is not storable (as described in the comments of
603 // function_ref class - function_ref contains non-ownable reference
604 // to the callable.
605 ///
606 /// \return an error, if any were triggered during execution.
608 std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
609
610 /// Callback type for loop body code generation.
611 ///
612 /// \param CodeGenIP is the insertion point where the loop's body code must be
613 /// placed. This will be a dedicated BasicBlock with a
614 /// conditional branch from the loop condition check and
615 /// terminated with an unconditional branch to the loop
616 /// latch.
617 /// \param IndVar is the induction variable usable at the insertion point.
618 ///
619 /// \return an error, if any were triggered during execution.
621 function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>;
622
623 /// Callback type for variable privatization (think copy & default
624 /// constructor).
625 ///
626 /// \param AllocaIP is the insertion point at which new alloca instructions
627 /// should be placed.
628 /// \param CodeGenIP is the insertion point at which the privatization code
629 /// should be placed.
630 /// \param Original The value being copied/created, should not be used in the
631 /// generated IR.
632 /// \param Inner The equivalent of \p Original that should be used in the
633 /// generated IR; this is equal to \p Original if the value is
634 /// a pointer and can thus be passed directly, otherwise it is
635 /// an equivalent but different value.
636 /// \param ReplVal The replacement value, thus a copy or new created version
637 /// of \p Inner.
638 ///
639 /// \returns The new insertion point where code generation continues and
640 /// \p ReplVal the replacement value.
642 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
643 Value &Inner, Value *&ReplVal)>;
644
645 /// Description of a LLVM-IR insertion point (IP) and a debug/source location
646 /// (filename, line, column, ...).
649 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
652 : IP(IP), DL(DL) {}
655 };
656
657 /// Emitter methods for OpenMP directives.
658 ///
659 ///{
660
661 /// Generator for '#omp barrier'
662 ///
663 /// \param Loc The location where the barrier directive was encountered.
664 /// \param Kind The kind of directive that caused the barrier.
665 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
666 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
667 /// should be checked and acted upon.
668 /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
669 ///
670 /// \returns The insertion point after the barrier.
672 omp::Directive Kind,
673 bool ForceSimpleCall = false,
674 bool CheckCancelFlag = true);
675
676 /// Generator for '#omp cancel'
677 ///
678 /// \param Loc The location where the directive was encountered.
679 /// \param IfCondition The evaluated 'if' clause expression, if any.
680 /// \param CanceledDirective The kind of directive that is cancled.
681 ///
682 /// \returns The insertion point after the barrier.
684 Value *IfCondition,
685 omp::Directive CanceledDirective);
686
687 /// Generator for '#omp parallel'
688 ///
689 /// \param Loc The insert and source location description.
690 /// \param AllocaIP The insertion points to be used for alloca instructions.
691 /// \param BodyGenCB Callback that will generate the region code.
692 /// \param PrivCB Callback to copy a given variable (think copy constructor).
693 /// \param FiniCB Callback to finalize variable copies.
694 /// \param IfCondition The evaluated 'if' clause expression, if any.
695 /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
696 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
697 /// \param IsCancellable Flag to indicate a cancellable parallel region.
698 ///
699 /// \returns The insertion position *after* the parallel.
702 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
703 FinalizeCallbackTy FiniCB, Value *IfCondition,
704 Value *NumThreads, omp::ProcBindKind ProcBind,
705 bool IsCancellable);
706
707 /// Generator for the control flow structure of an OpenMP canonical loop.
708 ///
709 /// This generator operates on the logical iteration space of the loop, i.e.
710 /// the caller only has to provide a loop trip count of the loop as defined by
711 /// base language semantics. The trip count is interpreted as an unsigned
712 /// integer. The induction variable passed to \p BodyGenCB will be of the same
713 /// type and run from 0 to \p TripCount - 1. It is up to the callback to
714 /// convert the logical iteration variable to the loop counter variable in the
715 /// loop body.
716 ///
717 /// \param Loc The insert and source location description. The insert
718 /// location can be between two instructions or the end of a
719 /// degenerate block (e.g. a BB under construction).
720 /// \param BodyGenCB Callback that will generate the loop body code.
721 /// \param TripCount Number of iterations the loop body is executed.
722 /// \param Name Base name used to derive BB and instruction names.
723 ///
724 /// \returns An object representing the created control flow structure which
725 /// can be used for loop-associated directives.
728 LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
729 const Twine &Name = "loop");
730
731 /// Generator for the control flow structure of an OpenMP canonical loop.
732 ///
733 /// Instead of a logical iteration space, this allows specifying user-defined
734 /// loop counter values using increment, upper- and lower bounds. To
735 /// disambiguate the terminology when counting downwards, instead of lower
736 /// bounds we use \p Start for the loop counter value in the first body
737 /// iteration.
738 ///
739 /// Consider the following limitations:
740 ///
741 /// * A loop counter space over all integer values of its bit-width cannot be
742 /// represented. E.g using uint8_t, its loop trip count of 256 cannot be
743 /// stored into an 8 bit integer):
744 ///
745 /// DO I = 0, 255, 1
746 ///
747 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g.
748 /// effectively counting downwards:
749 ///
750 /// for (uint8_t i = 100u; i > 0; i += 127u)
751 ///
752 ///
753 /// TODO: May need to add additional parameters to represent:
754 ///
755 /// * Allow representing downcounting with unsigned integers.
756 ///
757 /// * Sign of the step and the comparison operator might disagree:
758 ///
759 /// for (int i = 0; i < 42; i -= 1u)
760 ///
761 //
762 /// \param Loc The insert and source location description.
763 /// \param BodyGenCB Callback that will generate the loop body code.
764 /// \param Start Value of the loop counter for the first iterations.
765 /// \param Stop Loop counter values past this will stop the loop.
766 /// \param Step Loop counter increment after each iteration; negative
767 /// means counting down.
768 /// \param IsSigned Whether Start, Stop and Step are signed integers.
769 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
770 /// counter.
771 /// \param ComputeIP Insertion point for instructions computing the trip
772 /// count. Can be used to ensure the trip count is available
773 /// at the outermost loop of a loop nest. If not set,
774 /// defaults to the preheader of the generated loop.
775 /// \param Name Base name used to derive BB and instruction names.
776 ///
777 /// \returns An object representing the created control flow structure which
778 /// can be used for loop-associated directives.
780 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
781 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
782 InsertPointTy ComputeIP = {}, const Twine &Name = "loop");
783
784 /// Collapse a loop nest into a single loop.
785 ///
786 /// Merges loops of a loop nest into a single CanonicalLoopNest representation
787 /// that has the same number of innermost loop iterations as the origin loop
788 /// nest. The induction variables of the input loops are derived from the
789 /// collapsed loop's induction variable. This is intended to be used to
790 /// implement OpenMP's collapse clause. Before applying a directive,
791 /// collapseLoops normalizes a loop nest to contain only a single loop and the
792 /// directive's implementation does not need to handle multiple loops itself.
793 /// This does not remove the need to handle all loop nest handling by
794 /// directives, such as the ordered(<n>) clause or the simd schedule-clause
795 /// modifier of the worksharing-loop directive.
796 ///
797 /// Example:
798 /// \code
799 /// for (int i = 0; i < 7; ++i) // Canonical loop "i"
800 /// for (int j = 0; j < 9; ++j) // Canonical loop "j"
801 /// body(i, j);
802 /// \endcode
803 ///
804 /// After collapsing with Loops={i,j}, the loop is changed to
805 /// \code
806 /// for (int ij = 0; ij < 63; ++ij) {
807 /// int i = ij / 9;
808 /// int j = ij % 9;
809 /// body(i, j);
810 /// }
811 /// \endcode
812 ///
813 /// In the current implementation, the following limitations apply:
814 ///
815 /// * All input loops have an induction variable of the same type.
816 ///
817 /// * The collapsed loop will have the same trip count integer type as the
818 /// input loops. Therefore it is possible that the collapsed loop cannot
819 /// represent all iterations of the input loops. For instance, assuming a
820 /// 32 bit integer type, and two input loops both iterating 2^16 times, the
821 /// theoretical trip count of the collapsed loop would be 2^32 iteration,
822 /// which cannot be represented in an 32-bit integer. Behavior is undefined
823 /// in this case.
824 ///
825 /// * The trip counts of every input loop must be available at \p ComputeIP.
826 /// Non-rectangular loops are not yet supported.
827 ///
828 /// * At each nest level, code between a surrounding loop and its nested loop
829 /// is hoisted into the loop body, and such code will be executed more
830 /// often than before collapsing (or not at all if any inner loop iteration
831 /// has a trip count of 0). This is permitted by the OpenMP specification.
832 ///
833 /// \param DL Debug location for instructions added for collapsing,
834 /// such as instructions to compute/derive the input loop's
835 /// induction variables.
836 /// \param Loops Loops in the loop nest to collapse. Loops are specified
837 /// from outermost-to-innermost and every control flow of a
838 /// loop's body must pass through its directly nested loop.
839 /// \param ComputeIP Where additional instruction that compute the collapsed
840 /// trip count. If not set, defaults to before the generated
841 /// loop.
842 ///
843 /// \returns The CanonicalLoopInfo object representing the collapsed loop.
846 InsertPointTy ComputeIP);
847
848 /// Get the default alignment value for given target
849 ///
850 /// \param TargetTriple Target triple
851 /// \param Features StringMap which describes extra CPU features
852 static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
853 const StringMap<bool> &Features);
854
855 /// Retrieve (or create if non-existent) the address of a declare
856 /// target variable, used in conjunction with registerTargetGlobalVariable
857 /// to create declare target global variables.
858 ///
859 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
860 /// clause used in conjunction with the variable being registered (link,
861 /// to, enter).
862 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
863 /// clause used in conjunction with the variable being registered (nohost,
864 /// host, any)
865 /// \param IsDeclaration - boolean stating if the variable being registered
866 /// is a declaration-only and not a definition
867 /// \param IsExternallyVisible - boolean stating if the variable is externally
868 /// visible
869 /// \param EntryInfo - Unique entry information for the value generated
870 /// using getTargetEntryUniqueInfo, used to name generated pointer references
871 /// to the declare target variable
872 /// \param MangledName - the mangled name of the variable being registered
873 /// \param GeneratedRefs - references generated by invocations of
874 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
875 /// these are required by Clang for book keeping.
876 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
877 /// \param TargetTriple - The OpenMP device target triple we are compiling
878 /// for
879 /// \param LlvmPtrTy - The type of the variable we are generating or
880 /// retrieving an address for
881 /// \param GlobalInitializer - a lambda function which creates a constant
882 /// used for initializing a pointer reference to the variable in certain
883 /// cases. If a nullptr is passed, it will default to utilising the original
884 /// variable to initialize the pointer reference.
885 /// \param VariableLinkage - a lambda function which returns the variables
886 /// linkage type, if unspecified and a nullptr is given, it will instead
887 /// utilise the linkage stored on the existing global variable in the
888 /// LLVMModule.
892 bool IsDeclaration, bool IsExternallyVisible,
893 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
894 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
895 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
896 std::function<Constant *()> GlobalInitializer,
897 std::function<GlobalValue::LinkageTypes()> VariableLinkage);
898
899 /// Registers a target variable for device or host.
900 ///
901 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
902 /// clause used in conjunction with the variable being registered (link,
903 /// to, enter).
904 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
905 /// clause used in conjunction with the variable being registered (nohost,
906 /// host, any)
907 /// \param IsDeclaration - boolean stating if the variable being registered
908 /// is a declaration-only and not a definition
909 /// \param IsExternallyVisible - boolean stating if the variable is externally
910 /// visible
911 /// \param EntryInfo - Unique entry information for the value generated
912 /// using getTargetEntryUniqueInfo, used to name generated pointer references
913 /// to the declare target variable
914 /// \param MangledName - the mangled name of the variable being registered
915 /// \param GeneratedRefs - references generated by invocations of
916 /// registerTargetGlobalVariable these are required by Clang for book
917 /// keeping.
918 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
919 /// \param TargetTriple - The OpenMP device target triple we are compiling
920 /// for
921 /// \param GlobalInitializer - a lambda function which creates a constant
922 /// used for initializing a pointer reference to the variable in certain
923 /// cases. If a nullptr is passed, it will default to utilising the original
924 /// variable to initialize the pointer reference.
925 /// \param VariableLinkage - a lambda function which returns the variables
926 /// linkage type, if unspecified and a nullptr is given, it will instead
927 /// utilise the linkage stored on the existing global variable in the
928 /// LLVMModule.
929 /// \param LlvmPtrTy - The type of the variable we are generating or
930 /// retrieving an address for
931 /// \param Addr - the original llvm value (addr) of the variable to be
932 /// registered
936 bool IsDeclaration, bool IsExternallyVisible,
937 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
938 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
939 std::vector<Triple> TargetTriple,
940 std::function<Constant *()> GlobalInitializer,
941 std::function<GlobalValue::LinkageTypes()> VariableLinkage,
942 Type *LlvmPtrTy, Constant *Addr);
943
944 /// Get the offset of the OMP_MAP_MEMBER_OF field.
945 unsigned getFlagMemberOffset();
946
947 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
948 /// the position given.
949 /// \param Position - A value indicating the position of the parent
950 /// of the member in the kernel argument structure, often retrieved
951 /// by the parents position in the combined information vectors used
952 /// to generate the structure itself. Multiple children (member's of)
953 /// with the same parent will use the same returned member flag.
955
956 /// Given an initial flag set, this function modifies it to contain
957 /// the passed in MemberOfFlag generated from the getMemberOfFlag
958 /// function. The results are dependent on the existing flag bits
959 /// set in the original flag set.
960 /// \param Flags - The original set of flags to be modified with the
961 /// passed in MemberOfFlag.
962 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
963 /// slightly based on the getMemberOfFlag which adjusts the flag bits
964 /// based on the members position in its parent.
966 omp::OpenMPOffloadMappingFlags MemberOfFlag);
967
968private:
969 /// Modifies the canonical loop to be a statically-scheduled workshare loop
970 /// which is executed on the device
971 ///
972 /// This takes a \p CLI representing a canonical loop, such as the one
973 /// created by \see createCanonicalLoop and emits additional instructions to
974 /// turn it into a workshare loop. In particular, it calls to an OpenMP
975 /// runtime function in the preheader to call OpenMP device rtl function
976 /// which handles worksharing of loop body interations.
977 ///
978 /// \param DL Debug location for instructions added for the
979 /// workshare-loop construct itself.
980 /// \param CLI A descriptor of the canonical loop to workshare.
981 /// \param AllocaIP An insertion point for Alloca instructions usable in the
982 /// preheader of the loop.
983 /// \param LoopType Information about type of loop worksharing.
984 /// It corresponds to type of loop workshare OpenMP pragma.
985 ///
986 /// \returns Point where to insert code after the workshare construct.
987 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
988 InsertPointTy AllocaIP,
989 omp::WorksharingLoopType LoopType);
990
991 /// Modifies the canonical loop to be a statically-scheduled workshare loop.
992 ///
993 /// This takes a \p LoopInfo representing a canonical loop, such as the one
994 /// created by \p createCanonicalLoop and emits additional instructions to
995 /// turn it into a workshare loop. In particular, it calls to an OpenMP
996 /// runtime function in the preheader to obtain the loop bounds to be used in
997 /// the current thread, updates the relevant instructions in the canonical
998 /// loop and calls to an OpenMP runtime finalization function after the loop.
999 ///
1000 /// \param DL Debug location for instructions added for the
1001 /// workshare-loop construct itself.
1002 /// \param CLI A descriptor of the canonical loop to workshare.
1003 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1004 /// preheader of the loop.
1005 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1006 /// the loop.
1007 ///
1008 /// \returns Point where to insert code after the workshare construct.
1009 InsertPointOrErrorTy applyStaticWorkshareLoop(DebugLoc DL,
1010 CanonicalLoopInfo *CLI,
1011 InsertPointTy AllocaIP,
1012 bool NeedsBarrier);
1013
1014 /// Modifies the canonical loop a statically-scheduled workshare loop with a
1015 /// user-specified chunk size.
1016 ///
1017 /// \param DL Debug location for instructions added for the
1018 /// workshare-loop construct itself.
1019 /// \param CLI A descriptor of the canonical loop to workshare.
1020 /// \param AllocaIP An insertion point for Alloca instructions usable in
1021 /// the preheader of the loop.
1022 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
1023 /// loop.
1024 /// \param ChunkSize The user-specified chunk size.
1025 ///
1026 /// \returns Point where to insert code after the workshare construct.
1027 InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
1028 CanonicalLoopInfo *CLI,
1029 InsertPointTy AllocaIP,
1030 bool NeedsBarrier,
1031 Value *ChunkSize);
1032
1033 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
1034 ///
1035 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1036 /// created by \p createCanonicalLoop and emits additional instructions to
1037 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1038 /// runtime function in the preheader to obtain, and then in each iteration
1039 /// to update the loop counter.
1040 ///
1041 /// \param DL Debug location for instructions added for the
1042 /// workshare-loop construct itself.
1043 /// \param CLI A descriptor of the canonical loop to workshare.
1044 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1045 /// preheader of the loop.
1046 /// \param SchedType Type of scheduling to be passed to the init function.
1047 /// \param NeedsBarrier Indicates whether a barrier must be insterted after
1048 /// the loop.
1049 /// \param Chunk The size of loop chunk considered as a unit when
1050 /// scheduling. If \p nullptr, defaults to 1.
1051 ///
1052 /// \returns Point where to insert code after the workshare construct.
1053 InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL,
1054 CanonicalLoopInfo *CLI,
1055 InsertPointTy AllocaIP,
1056 omp::OMPScheduleType SchedType,
1057 bool NeedsBarrier,
1058 Value *Chunk = nullptr);
1059
1060 /// Create alternative version of the loop to support if clause
1061 ///
1062 /// OpenMP if clause can require to generate second loop. This loop
1063 /// will be executed when if clause condition is not met. createIfVersion
1064 /// adds branch instruction to the copied loop if \p ifCond is not met.
1065 ///
1066 /// \param Loop Original loop which should be versioned.
1067 /// \param IfCond Value which corresponds to if clause condition
1068 /// \param VMap Value to value map to define relation between
1069 /// original and copied loop values and loop blocks.
1070 /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1071 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1072 ValueToValueMapTy &VMap, const Twine &NamePrefix = "");
1073
1074public:
1075 /// Modifies the canonical loop to be a workshare loop.
1076 ///
1077 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1078 /// created by \p createCanonicalLoop and emits additional instructions to
1079 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1080 /// runtime function in the preheader to obtain the loop bounds to be used in
1081 /// the current thread, updates the relevant instructions in the canonical
1082 /// loop and calls to an OpenMP runtime finalization function after the loop.
1083 ///
1084 /// The concrete transformation is done by applyStaticWorkshareLoop,
1085 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1086 /// on the value of \p SchedKind and \p ChunkSize.
1087 ///
1088 /// \param DL Debug location for instructions added for the
1089 /// workshare-loop construct itself.
1090 /// \param CLI A descriptor of the canonical loop to workshare.
1091 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1092 /// preheader of the loop.
1093 /// \param NeedsBarrier Indicates whether a barrier must be insterted after
1094 /// the loop.
1095 /// \param SchedKind Scheduling algorithm to use.
1096 /// \param ChunkSize The chunk size for the inner loop.
1097 /// \param HasSimdModifier Whether the simd modifier is present in the
1098 /// schedule clause.
1099 /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1100 /// the schedule clause.
1101 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1102 /// present in the schedule clause.
1103 /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1104 /// present.
1105 /// \param LoopType Information about type of loop worksharing.
1106 /// It corresponds to type of loop workshare OpenMP pragma.
1107 ///
1108 /// \returns Point where to insert code after the workshare construct.
1111 bool NeedsBarrier,
1112 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1113 Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1114 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1115 bool HasOrderedClause = false,
1116 omp::WorksharingLoopType LoopType =
1118
1119 /// Tile a loop nest.
1120 ///
1121 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
1122 /// \p/ Loops must be perfectly nested, from outermost to innermost loop
1123 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1124 /// of every loop and every tile sizes must be usable in the outermost
1125 /// loop's preheader. This implies that the loop nest is rectangular.
1126 ///
1127 /// Example:
1128 /// \code
1129 /// for (int i = 0; i < 15; ++i) // Canonical loop "i"
1130 /// for (int j = 0; j < 14; ++j) // Canonical loop "j"
1131 /// body(i, j);
1132 /// \endcode
1133 ///
1134 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1135 /// \code
1136 /// for (int i1 = 0; i1 < 3; ++i1)
1137 /// for (int j1 = 0; j1 < 2; ++j1)
1138 /// for (int i2 = 0; i2 < 5; ++i2)
1139 /// for (int j2 = 0; j2 < 7; ++j2)
1140 /// body(i1*3+i2, j1*3+j2);
1141 /// \endcode
1142 ///
1143 /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are
1144 /// referred to the floor, and the loops i2 and j2 are the tiles. Tiling also
1145 /// handles non-constant trip counts, non-constant tile sizes and trip counts
1146 /// that are not multiples of the tile size. In the latter case the tile loop
1147 /// of the last floor-loop iteration will have fewer iterations than specified
1148 /// as its tile size.
1149 ///
1150 ///
1151 /// @param DL Debug location for instructions added by tiling, for
1152 /// instance the floor- and tile trip count computation.
1153 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are
1154 /// invalidated by this method, i.e. should not used after
1155 /// tiling.
1156 /// @param TileSizes For each loop in \p Loops, the tile size for that
1157 /// dimensions.
1158 ///
1159 /// \returns A list of generated loops. Contains twice as many loops as the
1160 /// input loop nest; the first half are the floor loops and the
1161 /// second half are the tile loops.
1162 std::vector<CanonicalLoopInfo *>
1164 ArrayRef<Value *> TileSizes);
1165
1166 /// Fully unroll a loop.
1167 ///
1168 /// Instead of unrolling the loop immediately (and duplicating its body
1169 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1170 /// metadata.
1171 ///
1172 /// \param DL Debug location for instructions added by unrolling.
1173 /// \param Loop The loop to unroll. The loop will be invalidated.
1175
1176 /// Fully or partially unroll a loop. How the loop is unrolled is determined
1177 /// using LLVM's LoopUnrollPass.
1178 ///
1179 /// \param DL Debug location for instructions added by unrolling.
1180 /// \param Loop The loop to unroll. The loop will be invalidated.
1182
1183 /// Partially unroll a loop.
1184 ///
1185 /// The CanonicalLoopInfo of the unrolled loop for use with chained
1186 /// loop-associated directive can be requested using \p UnrolledCLI. Not
1187 /// needing the CanonicalLoopInfo allows more efficient code generation by
1188 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1189 /// A loop-associated directive applied to the unrolled loop needs to know the
1190 /// new trip count which means that if using a heuristically determined unroll
1191 /// factor (\p Factor == 0), that factor must be computed immediately. We are
1192 /// using the same logic as the LoopUnrollPass to derived the unroll factor,
1193 /// but which assumes that some canonicalization has taken place (e.g.
1194 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1195 /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1196 ///
1197 /// \param DL Debug location for instructions added by unrolling.
1198 /// \param Loop The loop to unroll. The loop will be invalidated.
1199 /// \param Factor The factor to unroll the loop by. A factor of 0
1200 /// indicates that a heuristic should be used to determine
1201 /// the unroll-factor.
1202 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1203 /// partially unrolled loop. Otherwise, uses loop metadata
1204 /// to defer unrolling to the LoopUnrollPass.
1205 void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
1206 CanonicalLoopInfo **UnrolledCLI);
1207
1208 /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop
1209 /// is cloned. The metadata which prevents vectorization is added to
1210 /// to the cloned loop. The cloned loop is executed when ifCond is evaluated
1211 /// to false.
1212 ///
1213 /// \param Loop The loop to simd-ize.
1214 /// \param AlignedVars The map which containts pairs of the pointer
1215 /// and its corresponding alignment.
1216 /// \param IfCond The value which corresponds to the if clause
1217 /// condition.
1218 /// \param Order The enum to map order clause.
1219 /// \param Simdlen The Simdlen length to apply to the simd loop.
1220 /// \param Safelen The Safelen length to apply to the simd loop.
1222 MapVector<Value *, Value *> AlignedVars, Value *IfCond,
1223 omp::OrderKind Order, ConstantInt *Simdlen,
1224 ConstantInt *Safelen);
1225
1226 /// Generator for '#omp flush'
1227 ///
1228 /// \param Loc The location where the flush directive was encountered
1229 void createFlush(const LocationDescription &Loc);
1230
1231 /// Generator for '#omp taskwait'
1232 ///
1233 /// \param Loc The location where the taskwait directive was encountered.
1234 void createTaskwait(const LocationDescription &Loc);
1235
1236 /// Generator for '#omp taskyield'
1237 ///
1238 /// \param Loc The location where the taskyield directive was encountered.
1239 void createTaskyield(const LocationDescription &Loc);
1240
1241 /// A struct to pack the relevant information for an OpenMP depend clause.
1242 struct DependData {
1246 explicit DependData() = default;
1248 Value *DepVal)
1250 };
1251
1252 /// Generator for `#omp task`
1253 ///
1254 /// \param Loc The location where the task construct was encountered.
1255 /// \param AllocaIP The insertion point to be used for alloca instructions.
1256 /// \param BodyGenCB Callback that will generate the region code.
1257 /// \param Tied True if the task is tied, false if the task is untied.
1258 /// \param Final i1 value which is `true` if the task is final, `false` if the
1259 /// task is not final.
1260 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1261 /// task is generated, and the encountering thread must
1262 /// suspend the current task region, for which execution
1263 /// cannot be resumed until execution of the structured
1264 /// block that is associated with the generated task is
1265 /// completed.
1266 /// \param EventHandle If present, signifies the event handle as part of
1267 /// the detach clause
1268 /// \param Mergeable If the given task is `mergeable`
1269 /// \param priority `priority-value' specifies the execution order of the
1270 /// tasks that is generated by the construct
1272 createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
1273 BodyGenCallbackTy BodyGenCB, bool Tied = true,
1274 Value *Final = nullptr, Value *IfCondition = nullptr,
1275 SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
1276 Value *EventHandle = nullptr, Value *Priority = nullptr);
1277
1278 /// Generator for the taskgroup construct
1279 ///
1280 /// \param Loc The location where the taskgroup construct was encountered.
1281 /// \param AllocaIP The insertion point to be used for alloca instructions.
1282 /// \param BodyGenCB Callback that will generate the region code.
1283 InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc,
1284 InsertPointTy AllocaIP,
1285 BodyGenCallbackTy BodyGenCB);
1286
1288 std::function<std::tuple<std::string, uint64_t>()>;
1289
1290 /// Creates a unique info for a target entry when provided a filename and
1291 /// line number from.
1292 ///
1293 /// \param CallBack A callback function which should return filename the entry
1294 /// resides in as well as the line number for the target entry
1295 /// \param ParentName The name of the parent the target entry resides in, if
1296 /// any.
1299 StringRef ParentName = "");
1300
1301 /// Enum class for the RedctionGen CallBack type to be used.
1303
1304 /// ReductionGen CallBack for Clang
1305 ///
1306 /// \param CodeGenIP InsertPoint for CodeGen.
1307 /// \param Index Index of the ReductionInfo to generate code for.
1308 /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
1309 /// codegen, used for fixup later.
1310 /// \param RHSPtr Optionally used by Clang to
1311 /// return the RHSPtr it used for codegen, used for fixup later.
1312 /// \param CurFn Optionally used by Clang to pass in the Current Function as
1313 /// Clang context may be old.
1315 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1316 Value **LHS, Value **RHS, Function *CurFn)>;
1317
1318 /// ReductionGen CallBack for MLIR
1319 ///
1320 /// \param CodeGenIP InsertPoint for CodeGen.
1321 /// \param LHS Pass in the LHS Value to be used for CodeGen.
1322 /// \param RHS Pass in the RHS Value to be used for CodeGen.
1324 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1325
1326 /// Functions used to generate atomic reductions. Such functions take two
1327 /// Values representing pointers to LHS and RHS of the reduction, as well as
1328 /// the element type of these pointers. They are expected to atomically
1329 /// update the LHS to the reduced value.
1331 InsertPointTy, Type *, Value *, Value *)>;
1332
1333 /// Enum class for reduction evaluation types scalar, complex and aggregate.
1335
1336 /// Information about an OpenMP reduction.
1347 : ElementType(nullptr), Variable(nullptr),
1350
1351 /// Reduction element type, must match pointee type of variable.
1353
1354 /// Reduction variable of pointer type.
1356
1357 /// Thread-private partial reduction variable.
1359
1360 /// Reduction evaluation kind - scalar, complex or aggregate.
1362
1363 /// Callback for generating the reduction body. The IR produced by this will
1364 /// be used to combine two values in a thread-safe context, e.g., under
1365 /// lock or within the same thread, and therefore need not be atomic.
1367
1368 /// Clang callback for generating the reduction body. The IR produced by
1369 /// this will be used to combine two values in a thread-safe context, e.g.,
1370 /// under lock or within the same thread, and therefore need not be atomic.
1372
1373 /// Callback for generating the atomic reduction body, may be null. The IR
1374 /// produced by this will be used to atomically combine two values during
1375 /// reduction. If null, the implementation will use the non-atomic version
1376 /// along with the appropriate synchronization mechanisms.
1378 };
1379
1380 enum class CopyAction : unsigned {
1381 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1382 // the warp using shuffle instructions.
1384 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1385 ThreadCopy,
1386 };
1387
1392 };
1393
1394 /// Supporting functions for Reductions CodeGen.
1395private:
1396 /// Get the id of the current thread on the GPU.
1397 Value *getGPUThreadID();
1398
1399 /// Get the GPU warp size.
1400 Value *getGPUWarpSize();
1401
1402 /// Get the id of the warp in the block.
1403 /// We assume that the warp size is 32, which is always the case
1404 /// on the NVPTX device, to generate more efficient code.
1405 Value *getNVPTXWarpID();
1406
1407 /// Get the id of the current lane in the Warp.
1408 /// We assume that the warp size is 32, which is always the case
1409 /// on the NVPTX device, to generate more efficient code.
1410 Value *getNVPTXLaneID();
1411
1412 /// Cast value to the specified type.
1413 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1414
1415 /// This function creates calls to one of two shuffle functions to copy
1416 /// variables between lanes in a warp.
1417 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1418 Type *ElementType, Value *Offset);
1419
1420 /// Function to shuffle over the value from the remote lane.
1421 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1422 Type *ElementType, Value *Offset,
1423 Type *ReductionArrayTy);
1424
1425 /// Emit instructions to copy a Reduce list, which contains partially
1426 /// aggregated values, in the specified direction.
1427 void emitReductionListCopy(
1428 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1429 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1430 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1431
1432 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1433 /// in the same warp. It uses shuffle instructions to copy over data from
1434 /// a remote lane's stack. The reduction algorithm performed is specified
1435 /// by the fourth parameter.
1436 ///
1437 /// Algorithm Versions.
1438 /// Full Warp Reduce (argument value 0):
1439 /// This algorithm assumes that all 32 lanes are active and gathers
1440 /// data from these 32 lanes, producing a single resultant value.
1441 /// Contiguous Partial Warp Reduce (argument value 1):
1442 /// This algorithm assumes that only a *contiguous* subset of lanes
1443 /// are active. This happens for the last warp in a parallel region
1444 /// when the user specified num_threads is not an integer multiple of
1445 /// 32. This contiguous subset always starts with the zeroth lane.
1446 /// Partial Warp Reduce (argument value 2):
1447 /// This algorithm gathers data from any number of lanes at any position.
1448 /// All reduced values are stored in the lowest possible lane. The set
1449 /// of problems every algorithm addresses is a super set of those
1450 /// addressable by algorithms with a lower version number. Overhead
1451 /// increases as algorithm version increases.
1452 ///
1453 /// Terminology
1454 /// Reduce element:
1455 /// Reduce element refers to the individual data field with primitive
1456 /// data types to be combined and reduced across threads.
1457 /// Reduce list:
1458 /// Reduce list refers to a collection of local, thread-private
1459 /// reduce elements.
1460 /// Remote Reduce list:
1461 /// Remote Reduce list refers to a collection of remote (relative to
1462 /// the current thread) reduce elements.
1463 ///
1464 /// We distinguish between three states of threads that are important to
1465 /// the implementation of this function.
1466 /// Alive threads:
1467 /// Threads in a warp executing the SIMT instruction, as distinguished from
1468 /// threads that are inactive due to divergent control flow.
1469 /// Active threads:
1470 /// The minimal set of threads that has to be alive upon entry to this
1471 /// function. The computation is correct iff active threads are alive.
1472 /// Some threads are alive but they are not active because they do not
1473 /// contribute to the computation in any useful manner. Turning them off
1474 /// may introduce control flow overheads without any tangible benefits.
1475 /// Effective threads:
1476 /// In order to comply with the argument requirements of the shuffle
1477 /// function, we must keep all lanes holding data alive. But at most
1478 /// half of them perform value aggregation; we refer to this half of
1479 /// threads as effective. The other half is simply handing off their
1480 /// data.
1481 ///
1482 /// Procedure
1483 /// Value shuffle:
1484 /// In this step active threads transfer data from higher lane positions
1485 /// in the warp to lower lane positions, creating Remote Reduce list.
1486 /// Value aggregation:
1487 /// In this step, effective threads combine their thread local Reduce list
1488 /// with Remote Reduce list and store the result in the thread local
1489 /// Reduce list.
1490 /// Value copy:
1491 /// In this step, we deal with the assumption made by algorithm 2
1492 /// (i.e. contiguity assumption). When we have an odd number of lanes
1493 /// active, say 2k+1, only k threads will be effective and therefore k
1494 /// new values will be produced. However, the Reduce list owned by the
1495 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1496 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1497 /// that the contiguity assumption still holds.
1498 ///
1499 /// \param ReductionInfos Array type containing the ReductionOps.
1500 /// \param ReduceFn The reduction function.
1501 /// \param FuncAttrs Optional param to specify any function attributes that
1502 /// need to be copied to the new function.
1503 ///
1504 /// \return The ShuffleAndReduce function.
1505 Function *emitShuffleAndReduceFunction(
1507 Function *ReduceFn, AttributeList FuncAttrs);
1508
1509 /// This function emits a helper that gathers Reduce lists from the first
1510 /// lane of every active warp to lanes in the first warp.
1511 ///
1512 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1513 /// shared smem[warp_size];
1514 /// For all data entries D in reduce_data:
1515 /// sync
1516 /// If (I am the first lane in each warp)
1517 /// Copy my local D to smem[warp_id]
1518 /// sync
1519 /// if (I am the first warp)
1520 /// Copy smem[thread_id] to my local D
1521 ///
1522 /// \param Loc The insert and source location description.
1523 /// \param ReductionInfos Array type containing the ReductionOps.
1524 /// \param FuncAttrs Optional param to specify any function attributes that
1525 /// need to be copied to the new function.
1526 ///
1527 /// \return The InterWarpCopy function.
1529 emitInterWarpCopyFunction(const LocationDescription &Loc,
1530 ArrayRef<ReductionInfo> ReductionInfos,
1531 AttributeList FuncAttrs);
1532
1533 /// This function emits a helper that copies all the reduction variables from
1534 /// the team into the provided global buffer for the reduction variables.
1535 ///
1536 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1537 /// For all data entries D in reduce_data:
1538 /// Copy local D to buffer.D[Idx]
1539 ///
1540 /// \param ReductionInfos Array type containing the ReductionOps.
1541 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1542 /// \param FuncAttrs Optional param to specify any function attributes that
1543 /// need to be copied to the new function.
1544 ///
1545 /// \return The ListToGlobalCopy function.
1546 Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1547 Type *ReductionsBufferTy,
1548 AttributeList FuncAttrs);
1549
1550 /// This function emits a helper that copies all the reduction variables from
1551 /// the team into the provided global buffer for the reduction variables.
1552 ///
1553 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1554 /// For all data entries D in reduce_data:
1555 /// Copy buffer.D[Idx] to local D;
1556 ///
1557 /// \param ReductionInfos Array type containing the ReductionOps.
1558 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1559 /// \param FuncAttrs Optional param to specify any function attributes that
1560 /// need to be copied to the new function.
1561 ///
1562 /// \return The GlobalToList function.
1563 Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1564 Type *ReductionsBufferTy,
1565 AttributeList FuncAttrs);
1566
1567 /// This function emits a helper that reduces all the reduction variables from
1568 /// the team into the provided global buffer for the reduction variables.
1569 ///
1570 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1571 /// void *GlobPtrs[];
1572 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1573 /// ...
1574 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1575 /// reduce_function(GlobPtrs, reduce_data);
1576 ///
1577 /// \param ReductionInfos Array type containing the ReductionOps.
1578 /// \param ReduceFn The reduction function.
1579 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1580 /// \param FuncAttrs Optional param to specify any function attributes that
1581 /// need to be copied to the new function.
1582 ///
1583 /// \return The ListToGlobalReduce function.
1584 Function *
1585 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1586 Function *ReduceFn, Type *ReductionsBufferTy,
1587 AttributeList FuncAttrs);
1588
1589 /// This function emits a helper that reduces all the reduction variables from
1590 /// the team into the provided global buffer for the reduction variables.
1591 ///
1592 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1593 /// void *GlobPtrs[];
1594 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1595 /// ...
1596 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1597 /// reduce_function(reduce_data, GlobPtrs);
1598 ///
1599 /// \param ReductionInfos Array type containing the ReductionOps.
1600 /// \param ReduceFn The reduction function.
1601 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1602 /// \param FuncAttrs Optional param to specify any function attributes that
1603 /// need to be copied to the new function.
1604 ///
1605 /// \return The GlobalToListReduce function.
1606 Function *
1607 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1608 Function *ReduceFn, Type *ReductionsBufferTy,
1609 AttributeList FuncAttrs);
1610
1611 /// Get the function name of a reduction function.
1612 std::string getReductionFuncName(StringRef Name) const;
1613
1614 /// Emits reduction function.
1615 /// \param ReducerName Name of the function calling the reduction.
1616 /// \param ReductionInfos Array type containing the ReductionOps.
1617 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1618 /// CodeGenCB kind.
1619 /// \param FuncAttrs Optional param to specify any function attributes that
1620 /// need to be copied to the new function.
1621 ///
1622 /// \return The reduction function.
1623 Expected<Function *> createReductionFunction(
1624 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1626 AttributeList FuncAttrs = {});
1627
1628public:
1629 ///
1630 /// Design of OpenMP reductions on the GPU
1631 ///
1632 /// Consider a typical OpenMP program with one or more reduction
1633 /// clauses:
1634 ///
1635 /// float foo;
1636 /// double bar;
1637 /// #pragma omp target teams distribute parallel for \
1638 /// reduction(+:foo) reduction(*:bar)
1639 /// for (int i = 0; i < N; i++) {
1640 /// foo += A[i]; bar *= B[i];
1641 /// }
1642 ///
1643 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1644 /// all teams. In our OpenMP implementation on the NVPTX device an
1645 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1646 /// within a team are mapped to CUDA threads within a threadblock.
1647 /// Our goal is to efficiently aggregate values across all OpenMP
1648 /// threads such that:
1649 ///
1650 /// - the compiler and runtime are logically concise, and
1651 /// - the reduction is performed efficiently in a hierarchical
1652 /// manner as follows: within OpenMP threads in the same warp,
1653 /// across warps in a threadblock, and finally across teams on
1654 /// the NVPTX device.
1655 ///
1656 /// Introduction to Decoupling
1657 ///
1658 /// We would like to decouple the compiler and the runtime so that the
1659 /// latter is ignorant of the reduction variables (number, data types)
1660 /// and the reduction operators. This allows a simpler interface
1661 /// and implementation while still attaining good performance.
1662 ///
1663 /// Pseudocode for the aforementioned OpenMP program generated by the
1664 /// compiler is as follows:
1665 ///
1666 /// 1. Create private copies of reduction variables on each OpenMP
1667 /// thread: 'foo_private', 'bar_private'
1668 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1669 /// to it and writes the result in 'foo_private' and 'bar_private'
1670 /// respectively.
1671 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1672 /// and store the result on the team master:
1673 ///
1674 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1675 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1676 ///
1677 /// where:
1678 /// struct ReduceData {
1679 /// double *foo;
1680 /// double *bar;
1681 /// } reduceData
1682 /// reduceData.foo = &foo_private
1683 /// reduceData.bar = &bar_private
1684 ///
1685 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1686 /// auxiliary functions generated by the compiler that operate on
1687 /// variables of type 'ReduceData'. They aid the runtime perform
1688 /// algorithmic steps in a data agnostic manner.
1689 ///
1690 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1691 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1692 /// same warp. It takes the following arguments as input:
1693 ///
1694 /// a. variable of type 'ReduceData' on the calling lane,
1695 /// b. its lane_id,
1696 /// c. an offset relative to the current lane_id to generate a
1697 /// remote_lane_id. The remote lane contains the second
1698 /// variable of type 'ReduceData' that is to be reduced.
1699 /// d. an algorithm version parameter determining which reduction
1700 /// algorithm to use.
1701 ///
1702 /// 'shuffleReduceFn' retrieves data from the remote lane using
1703 /// efficient GPU shuffle intrinsics and reduces, using the
1704 /// algorithm specified by the 4th parameter, the two operands
1705 /// element-wise. The result is written to the first operand.
1706 ///
1707 /// Different reduction algorithms are implemented in different
1708 /// runtime functions, all calling 'shuffleReduceFn' to perform
1709 /// the essential reduction step. Therefore, based on the 4th
1710 /// parameter, this function behaves slightly differently to
1711 /// cooperate with the runtime to ensure correctness under
1712 /// different circumstances.
1713 ///
1714 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1715 /// reduced variables across warps. It tunnels, through CUDA
1716 /// shared memory, the thread-private data of type 'ReduceData'
1717 /// from lane 0 of each warp to a lane in the first warp.
1718 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1719 /// The last team writes the global reduced value to memory.
1720 ///
1721 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1722 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1723 /// scratchpadCopyFn, loadAndReduceFn)
1724 ///
1725 /// 'scratchpadCopyFn' is a helper that stores reduced
1726 /// data from the team master to a scratchpad array in
1727 /// global memory.
1728 ///
1729 /// 'loadAndReduceFn' is a helper that loads data from
1730 /// the scratchpad array and reduces it with the input
1731 /// operand.
1732 ///
1733 /// These compiler generated functions hide address
1734 /// calculation and alignment information from the runtime.
1735 /// 5. if ret == 1:
1736 /// The team master of the last team stores the reduced
1737 /// result to the globals in memory.
1738 /// foo += reduceData.foo; bar *= reduceData.bar
1739 ///
1740 ///
1741 /// Warp Reduction Algorithms
1742 ///
1743 /// On the warp level, we have three algorithms implemented in the
1744 /// OpenMP runtime depending on the number of active lanes:
1745 ///
1746 /// Full Warp Reduction
1747 ///
1748 /// The reduce algorithm within a warp where all lanes are active
1749 /// is implemented in the runtime as follows:
1750 ///
1751 /// full_warp_reduce(void *reduce_data,
1752 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1753 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1754 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1755 /// }
1756 ///
1757 /// The algorithm completes in log(2, WARPSIZE) steps.
1758 ///
1759 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1760 /// not used therefore we save instructions by not retrieving lane_id
1761 /// from the corresponding special registers. The 4th parameter, which
1762 /// represents the version of the algorithm being used, is set to 0 to
1763 /// signify full warp reduction.
1764 ///
1765 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1766 ///
1767 /// #reduce_elem refers to an element in the local lane's data structure
1768 /// #remote_elem is retrieved from a remote lane
1769 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1770 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1771 ///
1772 /// Contiguous Partial Warp Reduction
1773 ///
1774 /// This reduce algorithm is used within a warp where only the first
1775 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1776 /// number of OpenMP threads in a parallel region is not a multiple of
1777 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1778 ///
1779 /// void
1780 /// contiguous_partial_reduce(void *reduce_data,
1781 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1782 /// int size, int lane_id) {
1783 /// int curr_size;
1784 /// int offset;
1785 /// curr_size = size;
1786 /// mask = curr_size/2;
1787 /// while (offset>0) {
1788 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1789 /// curr_size = (curr_size+1)/2;
1790 /// offset = curr_size/2;
1791 /// }
1792 /// }
1793 ///
1794 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1795 ///
1796 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1797 /// if (lane_id < offset)
1798 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1799 /// else
1800 /// reduce_elem = remote_elem
1801 ///
1802 /// This algorithm assumes that the data to be reduced are located in a
1803 /// contiguous subset of lanes starting from the first. When there is
1804 /// an odd number of active lanes, the data in the last lane is not
1805 /// aggregated with any other lane's dat but is instead copied over.
1806 ///
1807 /// Dispersed Partial Warp Reduction
1808 ///
1809 /// This algorithm is used within a warp when any discontiguous subset of
1810 /// lanes are active. It is used to implement the reduction operation
1811 /// across lanes in an OpenMP simd region or in a nested parallel region.
1812 ///
1813 /// void
1814 /// dispersed_partial_reduce(void *reduce_data,
1815 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1816 /// int size, remote_id;
1817 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1818 /// do {
1819 /// remote_id = next_active_lane_id_right_after_me();
1820 /// # the above function returns 0 of no active lane
1821 /// # is present right after the current lane.
1822 /// size = number_of_active_lanes_in_this_warp();
1823 /// logical_lane_id /= 2;
1824 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1825 /// remote_id-1-threadIdx.x, 2);
1826 /// } while (logical_lane_id % 2 == 0 && size > 1);
1827 /// }
1828 ///
1829 /// There is no assumption made about the initial state of the reduction.
1830 /// Any number of lanes (>=1) could be active at any position. The reduction
1831 /// result is returned in the first active lane.
1832 ///
1833 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1834 ///
1835 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1836 /// if (lane_id % 2 == 0 && offset > 0)
1837 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1838 /// else
1839 /// reduce_elem = remote_elem
1840 ///
1841 ///
1842 /// Intra-Team Reduction
1843 ///
1844 /// This function, as implemented in the runtime call
1845 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1846 /// threads in a team. It first reduces within a warp using the
1847 /// aforementioned algorithms. We then proceed to gather all such
1848 /// reduced values at the first warp.
1849 ///
1850 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1851 /// data from each of the "warp master" (zeroth lane of each warp, where
1852 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1853 /// a mathematical sense) the problem of reduction across warp masters in
1854 /// a block to the problem of warp reduction.
1855 ///
1856 ///
1857 /// Inter-Team Reduction
1858 ///
1859 /// Once a team has reduced its data to a single value, it is stored in
1860 /// a global scratchpad array. Since each team has a distinct slot, this
1861 /// can be done without locking.
1862 ///
1863 /// The last team to write to the scratchpad array proceeds to reduce the
1864 /// scratchpad array. One or more workers in the last team use the helper
1865 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1866 /// the k'th worker reduces every k'th element.
1867 ///
1868 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1869 /// reduce across workers and compute a globally reduced value.
1870 ///
1871 /// \param Loc The location where the reduction was
1872 /// encountered. Must be within the associate
1873 /// directive and after the last local access to the
1874 /// reduction variables.
1875 /// \param AllocaIP An insertion point suitable for allocas usable
1876 /// in reductions.
1877 /// \param CodeGenIP An insertion point suitable for code
1878 /// generation. \param ReductionInfos A list of info on each reduction
1879 /// variable. \param IsNoWait Optional flag set if the reduction is
1880 /// marked as
1881 /// nowait.
1882 /// \param IsTeamsReduction Optional flag set if it is a teams
1883 /// reduction.
1884 /// \param HasDistribute Optional flag set if it is a
1885 /// distribute reduction.
1886 /// \param GridValue Optional GPU grid value.
1887 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
1888 /// used for teams reduction.
1889 /// \param SrcLocInfo Source location information global.
1891 const LocationDescription &Loc, InsertPointTy AllocaIP,
1892 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
1893 bool IsNoWait = false, bool IsTeamsReduction = false,
1894 bool HasDistribute = false,
1896 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
1897 Value *SrcLocInfo = nullptr);
1898
1899 // TODO: provide atomic and non-atomic reduction generators for reduction
1900 // operators defined by the OpenMP specification.
1901
1902 /// Generator for '#omp reduction'.
1903 ///
1904 /// Emits the IR instructing the runtime to perform the specific kind of
1905 /// reductions. Expects reduction variables to have been privatized and
1906 /// initialized to reduction-neutral values separately. Emits the calls to
1907 /// runtime functions as well as the reduction function and the basic blocks
1908 /// performing the reduction atomically and non-atomically.
1909 ///
1910 /// The code emitted for the following:
1911 ///
1912 /// \code
1913 /// type var_1;
1914 /// type var_2;
1915 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
1916 /// /* body */;
1917 /// \endcode
1918 ///
1919 /// corresponds to the following sketch.
1920 ///
1921 /// \code
1922 /// void _outlined_par() {
1923 /// // N is the number of different reductions.
1924 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
1925 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
1926 /// _omp_reduction_func,
1927 /// _gomp_critical_user.reduction.var)) {
1928 /// case 1: {
1929 /// var_1 = var_1 <reduction-op> privatized_var_1;
1930 /// var_2 = var_2 <reduction-op> privatized_var_2;
1931 /// // ...
1932 /// __kmpc_end_reduce(...);
1933 /// break;
1934 /// }
1935 /// case 2: {
1936 /// _Atomic<ReductionOp>(var_1, privatized_var_1);
1937 /// _Atomic<ReductionOp>(var_2, privatized_var_2);
1938 /// // ...
1939 /// break;
1940 /// }
1941 /// default: break;
1942 /// }
1943 /// }
1944 ///
1945 /// void _omp_reduction_func(void **lhs, void **rhs) {
1946 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
1947 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
1948 /// // ...
1949 /// }
1950 /// \endcode
1951 ///
1952 /// \param Loc The location where the reduction was
1953 /// encountered. Must be within the associate
1954 /// directive and after the last local access to the
1955 /// reduction variables.
1956 /// \param AllocaIP An insertion point suitable for allocas usable
1957 /// in reductions.
1958 /// \param ReductionInfos A list of info on each reduction variable.
1959 /// \param IsNoWait A flag set if the reduction is marked as nowait.
1960 /// \param IsByRef A flag set if the reduction is using reference
1961 /// or direct value.
1962 InsertPointOrErrorTy createReductions(const LocationDescription &Loc,
1963 InsertPointTy AllocaIP,
1964 ArrayRef<ReductionInfo> ReductionInfos,
1965 ArrayRef<bool> IsByRef,
1966 bool IsNoWait = false);
1967
1968 ///}
1969
1970 /// Return the insertion point used by the underlying IRBuilder.
1972
1973 /// Update the internal location to \p Loc.
1975 Builder.restoreIP(Loc.IP);
1977 return Loc.IP.getBlock() != nullptr;
1978 }
1979
1980 /// Return the function declaration for the runtime function with \p FnID.
1983
1985
1986 /// Return the (LLVM-IR) string describing the source location \p LocStr.
1987 Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize);
1988
1989 /// Return the (LLVM-IR) string describing the default source location.
1991
1992 /// Return the (LLVM-IR) string describing the source location identified by
1993 /// the arguments.
1994 Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
1995 unsigned Line, unsigned Column,
1996 uint32_t &SrcLocStrSize);
1997
1998 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
1999 /// fallback if \p DL does not specify the function name.
2001 Function *F = nullptr);
2002
2003 /// Return the (LLVM-IR) string describing the source location \p Loc.
2004 Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
2005 uint32_t &SrcLocStrSize);
2006
2007 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
2008 /// TODO: Create a enum class for the Reserve2Flags
2009 Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize,
2010 omp::IdentFlag Flags = omp::IdentFlag(0),
2011 unsigned Reserve2Flags = 0);
2012
2013 /// Create a hidden global flag \p Name in the module with initial value \p
2014 /// Value.
2016
2017 /// Emit the llvm.used metadata.
2019
2020 /// Emit the kernel execution mode.
2023
2024 /// Generate control flow and cleanup for cancellation.
2025 ///
2026 /// \param CancelFlag Flag indicating if the cancellation is performed.
2027 /// \param CanceledDirective The kind of directive that is cancled.
2028 /// \param ExitCB Extra code to be generated in the exit block.
2029 ///
2030 /// \return an error, if any were triggered during execution.
2032 omp::Directive CanceledDirective,
2033 FinalizeCallbackTy ExitCB = {});
2034
2035 /// Generate a target region entry call.
2036 ///
2037 /// \param Loc The location at which the request originated and is fulfilled.
2038 /// \param AllocaIP The insertion point to be used for alloca instructions.
2039 /// \param Return Return value of the created function returned by reference.
2040 /// \param DeviceID Identifier for the device via the 'device' clause.
2041 /// \param NumTeams Numer of teams for the region via the 'num_teams' clause
2042 /// or 0 if unspecified and -1 if there is no 'teams' clause.
2043 /// \param NumThreads Number of threads via the 'thread_limit' clause.
2044 /// \param HostPtr Pointer to the host-side pointer of the target kernel.
2045 /// \param KernelArgs Array of arguments to the kernel.
2046 InsertPointTy emitTargetKernel(const LocationDescription &Loc,
2047 InsertPointTy AllocaIP, Value *&Return,
2048 Value *Ident, Value *DeviceID, Value *NumTeams,
2049 Value *NumThreads, Value *HostPtr,
2050 ArrayRef<Value *> KernelArgs);
2051
2052 /// Generate a flush runtime call.
2053 ///
2054 /// \param Loc The location at which the request originated and is fulfilled.
2055 void emitFlush(const LocationDescription &Loc);
2056
2057 /// The finalization stack made up of finalize callbacks currently in-flight,
2058 /// wrapped into FinalizationInfo objects that reference also the finalization
2059 /// target block and the kind of cancellable directive.
2061
2062 /// Return true if the last entry in the finalization stack is of kind \p DK
2063 /// and cancellable.
2064 bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2065 return !FinalizationStack.empty() &&
2066 FinalizationStack.back().IsCancellable &&
2067 FinalizationStack.back().DK == DK;
2068 }
2069
2070 /// Generate a taskwait runtime call.
2071 ///
2072 /// \param Loc The location at which the request originated and is fulfilled.
2073 void emitTaskwaitImpl(const LocationDescription &Loc);
2074
2075 /// Generate a taskyield runtime call.
2076 ///
2077 /// \param Loc The location at which the request originated and is fulfilled.
2078 void emitTaskyieldImpl(const LocationDescription &Loc);
2079
2080 /// Return the current thread ID.
2081 ///
2082 /// \param Ident The ident (ident_t*) describing the query origin.
2084
2085 /// The OpenMPIRBuilder Configuration
2087
2088 /// The underlying LLVM-IR module
2090
2091 /// The LLVM-IR Builder used to create IR.
2093
2094 /// Map to remember source location strings
2096
2097 /// Map to remember existing ident_t*.
2099
2100 /// Info manager to keep track of target regions.
2102
2103 /// The target triple of the underlying module.
2104 const Triple T;
2105
2106 /// Helper that contains information about regions we need to outline
2107 /// during finalization.
2109 using PostOutlineCBTy = std::function<void(Function &)>;
2113
2114 /// Collect all blocks in between EntryBB and ExitBB in both the given
2115 /// vector and set.
2117 SmallVectorImpl<BasicBlock *> &BlockVector);
2118
2119 /// Return the function that contains the region to be outlined.
2120 Function *getFunction() const { return EntryBB->getParent(); }
2121 };
2122
2123 /// Collection of regions that need to be outlined during finalization.
2125
2126 /// A collection of candidate target functions that's constant allocas will
2127 /// attempt to be raised on a call of finalize after all currently enqueued
2128 /// outline info's have been processed.
2130
2131 /// Collection of owned canonical loop objects that eventually need to be
2132 /// free'd.
2133 std::forward_list<CanonicalLoopInfo> LoopInfos;
2134
2135 /// Add a new region that will be outlined later.
2136 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2137
2138 /// An ordered map of auto-generated variables to their unique names.
2139 /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2140 /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2141 /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
2142 /// variables.
2144
2145 /// Computes the size of type in bytes.
2146 Value *getSizeInBytes(Value *BasePtr);
2147
2148 // Emit a branch from the current block to the Target block only if
2149 // the current block has a terminator.
2151
2152 // If BB has no use then delete it and return. Else place BB after the current
2153 // block, if possible, or else at the end of the function. Also add a branch
2154 // from current block to BB if current block does not have a terminator.
2155 void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false);
2156
2157 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
2158 /// Here is the logic:
2159 /// if (Cond) {
2160 /// ThenGen();
2161 /// } else {
2162 /// ElseGen();
2163 /// }
2164 ///
2165 /// \return an error, if any were triggered during execution.
2167 BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {});
2168
2169 /// Create the global variable holding the offload mappings information.
2171 std::string VarName);
2172
2173 /// Create the global variable holding the offload names information.
2176 std::string VarName);
2177
2180 AllocaInst *Args = nullptr;
2182 };
2183
2184 /// Create the allocas instruction used in call to mapper functions.
2186 InsertPointTy AllocaIP, unsigned NumOperands,
2188
2189 /// Create the call for the target mapper function.
2190 /// \param Loc The source location description.
2191 /// \param MapperFunc Function to be called.
2192 /// \param SrcLocInfo Source location information global.
2193 /// \param MaptypesArg The argument types.
2194 /// \param MapnamesArg The argument names.
2195 /// \param MapperAllocas The AllocaInst used for the call.
2196 /// \param DeviceID Device ID for the call.
2197 /// \param NumOperands Number of operands in the call.
2198 void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
2199 Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
2200 struct MapperAllocas &MapperAllocas, int64_t DeviceID,
2201 unsigned NumOperands);
2202
2203 /// Container for the arguments used to pass data to the runtime library.
2205 /// The array of base pointer passed to the runtime library.
2207 /// The array of section pointers passed to the runtime library.
2209 /// The array of sizes passed to the runtime library.
2210 Value *SizesArray = nullptr;
2211 /// The array of map types passed to the runtime library for the beginning
2212 /// of the region or for the entire region if there are no separate map
2213 /// types for the region end.
2215 /// The array of map types passed to the runtime library for the end of the
2216 /// region, or nullptr if there are no separate map types for the region
2217 /// end.
2219 /// The array of user-defined mappers passed to the runtime library.
2221 /// The array of original declaration names of mapped pointers sent to the
2222 /// runtime library for debugging
2224
2225 explicit TargetDataRTArgs() {}
2234 };
2235
2236 /// Container to pass the default attributes with which a kernel must be
2237 /// launched, used to set kernel attributes and populate associated static
2238 /// structures.
2239 ///
2240 /// For max values, < 0 means unset, == 0 means set but unknown at compile
2241 /// time. The number of max values will be 1 except for the case where
2242 /// ompx_bare is set.
2247 int32_t MinTeams = 1;
2249 int32_t MinThreads = 1;
2250 };
2251
2252 /// Container to pass LLVM IR runtime values or constants related to the
2253 /// number of teams and threads with which the kernel must be launched, as
2254 /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
2255 /// kernel. These must be defined in the host prior to the call to the kernel
2256 /// launch OpenMP RTL function.
2259 Value *MinTeams = nullptr;
2262
2263 /// 'parallel' construct 'num_threads' clause value, if present and it is an
2264 /// SPMD kernel.
2265 Value *MaxThreads = nullptr;
2266
2267 /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if
2268 /// it is a generic kernel.
2270 };
2271
2272 /// Data structure that contains the needed information to construct the
2273 /// kernel args vector.
2275 /// Number of arguments passed to the runtime library.
2276 unsigned NumTargetItems = 0;
2277 /// Arguments passed to the runtime library
2279 /// The number of iterations
2281 /// The number of teams.
2283 /// The number of threads.
2285 /// The size of the dynamic shared memory.
2287 /// True if the kernel has 'no wait' clause.
2288 bool HasNoWait = false;
2289
2290 // Constructors for TargetKernelArgs.
2295 bool HasNoWait)
2300 };
2301
2302 /// Create the kernel args vector used by emitTargetKernel. This function
2303 /// creates various constant values that are used in the resulting args
2304 /// vector.
2305 static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2307 SmallVector<Value *> &ArgsVector);
2308
2309 /// Struct that keeps the information that should be kept throughout
2310 /// a 'target data' region.
2312 /// Set to true if device pointer information have to be obtained.
2313 bool RequiresDevicePointerInfo = false;
2314 /// Set to true if Clang emits separate runtime calls for the beginning and
2315 /// end of the region. These calls might have separate map type arrays.
2316 bool SeparateBeginEndCalls = false;
2317
2318 public:
2320
2323
2324 /// Indicate whether any user-defined mapper exists.
2325 bool HasMapper = false;
2326 /// The total number of pointers passed to the runtime library.
2327 unsigned NumberOfPtrs = 0u;
2328
2329 bool EmitDebug = false;
2330
2331 /// Whether the `target ... data` directive has a `nowait` clause.
2332 bool HasNoWait = false;
2333
2334 explicit TargetDataInfo() {}
2335 explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2336 bool SeparateBeginEndCalls)
2337 : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2338 SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2339 /// Clear information about the data arrays.
2342 HasMapper = false;
2343 NumberOfPtrs = 0u;
2344 }
2345 /// Return true if the current target data information has valid arrays.
2346 bool isValid() {
2350 }
2351 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
2352 bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2353 };
2354
2362
2363 /// This structure contains combined information generated for mappable
2364 /// clauses, including base pointers, pointers, sizes, map types, user-defined
2365 /// mappers, and non-contiguous information.
2366 struct MapInfosTy {
2368 bool IsNonContiguous = false;
2373 };
2381
2382 /// Append arrays in \a CurInfo.
2383 void append(MapInfosTy &CurInfo) {
2385 CurInfo.BasePointers.end());
2386 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2388 CurInfo.DevicePointers.end());
2389 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2390 Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2391 Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2393 CurInfo.NonContigInfo.Dims.end());
2395 CurInfo.NonContigInfo.Offsets.end());
2397 CurInfo.NonContigInfo.Counts.end());
2399 CurInfo.NonContigInfo.Strides.end());
2400 }
2401 };
2402
2403 /// Callback function type for functions emitting the host fallback code that
2404 /// is executed when the kernel launch fails. It takes an insertion point as
2405 /// parameter where the code should be emitted. It returns an insertion point
2406 /// that points right after after the emitted code.
2409
2410 /// Generate a target region entry call and host fallback call.
2411 ///
2412 /// \param Loc The location at which the request originated and is fulfilled.
2413 /// \param OutlinedFnID The ooulined function ID.
2414 /// \param EmitTargetCallFallbackCB Call back function to generate host
2415 /// fallback code.
2416 /// \param Args Data structure holding information about the kernel arguments.
2417 /// \param DeviceID Identifier for the device via the 'device' clause.
2418 /// \param RTLoc Source location identifier
2419 /// \param AllocaIP The insertion point to be used for alloca instructions.
2421 emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID,
2422 EmitFallbackCallbackTy EmitTargetCallFallbackCB,
2423 TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc,
2424 InsertPointTy AllocaIP);
2425
2426 /// Callback type for generating the bodies of device directives that require
2427 /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses).
2428 ///
2429 /// \param DeviceID The ID of the device on which the target region will
2430 /// execute.
2431 /// \param RTLoc Source location identifier
2432 /// \Param TargetTaskAllocaIP Insertion point for the alloca block of the
2433 /// generated task.
2434 ///
2435 /// \return an error, if any were triggered during execution.
2437 function_ref<Error(Value *DeviceID, Value *RTLoc,
2438 IRBuilderBase::InsertPoint TargetTaskAllocaIP)>;
2439
2440 /// Generate a target-task for the target construct
2441 ///
2442 /// \param TaskBodyCB Callback to generate the actual body of the target task.
2443 /// \param DeviceID Identifier for the device via the 'device' clause.
2444 /// \param RTLoc Source location identifier
2445 /// \param AllocaIP The insertion point to be used for alloca instructions.
2446 /// \param Dependencies Vector of DependData objects holding information of
2447 /// dependencies as specified by the 'depend' clause.
2448 /// \param HasNoWait True if the target construct had 'nowait' on it, false
2449 /// otherwise
2451 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
2454 bool HasNoWait);
2455
2456 /// Emit the arguments to be passed to the runtime library based on the
2457 /// arrays of base pointers, pointers, sizes, map types, and mappers. If
2458 /// ForEndCall, emit map types to be passed for the end of the region instead
2459 /// of the beginning.
2463 bool ForEndCall = false);
2464
2465 /// Emit an array of struct descriptors to be assigned to the offload args.
2467 InsertPointTy CodeGenIP,
2468 MapInfosTy &CombinedInfo,
2470
2471 /// Emit the arrays used to pass the captures and map information to the
2472 /// offloading runtime library. If there is no map or capture information,
2473 /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2474 /// that contains information generated for mappable clauses,
2475 /// including base pointers, pointers, sizes, map types, user-defined mappers.
2477 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2478 TargetDataInfo &Info, bool IsNonContiguous = false,
2479 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2480 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
2481
2482 /// Allocates memory for and populates the arrays required for offloading
2483 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2484 /// emits their base addresses as arguments to be passed to the runtime
2485 /// library. In essence, this function is a combination of
2486 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2487 /// be preferred by clients of OpenMPIRBuilder.
2489 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2490 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2491 bool IsNonContiguous = false, bool ForEndCall = false,
2492 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2493 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
2494
2495 /// Creates offloading entry for the provided entry ID \a ID, address \a
2496 /// Addr, size \a Size, and flags \a Flags.
2498 int32_t Flags, GlobalValue::LinkageTypes,
2499 StringRef Name = "");
2500
2501 /// The kind of errors that can occur when emitting the offload entries and
2502 /// metadata.
2508
2509 /// Callback function type
2511 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2512
2513 // Emit the offloading entries and metadata so that the device codegen side
2514 // can easily figure out what to emit. The produced metadata looks like
2515 // this:
2516 //
2517 // !omp_offload.info = !{!1, ...}
2518 //
2519 // We only generate metadata for function that contain target regions.
2521 EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2522
2523public:
2524 /// Generator for __kmpc_copyprivate
2525 ///
2526 /// \param Loc The source location description.
2527 /// \param BufSize Number of elements in the buffer.
2528 /// \param CpyBuf List of pointers to data to be copied.
2529 /// \param CpyFn function to call for copying data.
2530 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2531 ///
2532 /// \return The insertion position *after* the CopyPrivate call.
2533
2535 llvm::Value *BufSize, llvm::Value *CpyBuf,
2536 llvm::Value *CpyFn, llvm::Value *DidIt);
2537
2538 /// Generator for '#omp single'
2539 ///
2540 /// \param Loc The source location description.
2541 /// \param BodyGenCB Callback that will generate the region code.
2542 /// \param FiniCB Callback to finalize variable copies.
2543 /// \param IsNowait If false, a barrier is emitted.
2544 /// \param CPVars copyprivate variables.
2545 /// \param CPFuncs copy functions to use for each copyprivate variable.
2546 ///
2547 /// \returns The insertion position *after* the single call.
2549 BodyGenCallbackTy BodyGenCB,
2550 FinalizeCallbackTy FiniCB, bool IsNowait,
2551 ArrayRef<llvm::Value *> CPVars = {},
2552 ArrayRef<llvm::Function *> CPFuncs = {});
2553
2554 /// Generator for '#omp master'
2555 ///
2556 /// \param Loc The insert and source location description.
2557 /// \param BodyGenCB Callback that will generate the region code.
2558 /// \param FiniCB Callback to finalize variable copies.
2559 ///
2560 /// \returns The insertion position *after* the master.
2561 InsertPointOrErrorTy createMaster(const LocationDescription &Loc,
2562 BodyGenCallbackTy BodyGenCB,
2563 FinalizeCallbackTy FiniCB);
2564
2565 /// Generator for '#omp masked'
2566 ///
2567 /// \param Loc The insert and source location description.
2568 /// \param BodyGenCB Callback that will generate the region code.
2569 /// \param FiniCB Callback to finialize variable copies.
2570 ///
2571 /// \returns The insertion position *after* the masked.
2572 InsertPointOrErrorTy createMasked(const LocationDescription &Loc,
2573 BodyGenCallbackTy BodyGenCB,
2574 FinalizeCallbackTy FiniCB, Value *Filter);
2575
2576 /// Generator for '#omp critical'
2577 ///
2578 /// \param Loc The insert and source location description.
2579 /// \param BodyGenCB Callback that will generate the region body code.
2580 /// \param FiniCB Callback to finalize variable copies.
2581 /// \param CriticalName name of the lock used by the critical directive
2582 /// \param HintInst Hint Instruction for hint clause associated with critical
2583 ///
2584 /// \returns The insertion position *after* the critical.
2585 InsertPointOrErrorTy createCritical(const LocationDescription &Loc,
2586 BodyGenCallbackTy BodyGenCB,
2587 FinalizeCallbackTy FiniCB,
2588 StringRef CriticalName, Value *HintInst);
2589
2590 /// Generator for '#omp ordered depend (source | sink)'
2591 ///
2592 /// \param Loc The insert and source location description.
2593 /// \param AllocaIP The insertion point to be used for alloca instructions.
2594 /// \param NumLoops The number of loops in depend clause.
2595 /// \param StoreValues The value will be stored in vector address.
2596 /// \param Name The name of alloca instruction.
2597 /// \param IsDependSource If true, depend source; otherwise, depend sink.
2598 ///
2599 /// \return The insertion position *after* the ordered.
2600 InsertPointTy createOrderedDepend(const LocationDescription &Loc,
2601 InsertPointTy AllocaIP, unsigned NumLoops,
2602 ArrayRef<llvm::Value *> StoreValues,
2603 const Twine &Name, bool IsDependSource);
2604
2605 /// Generator for '#omp ordered [threads | simd]'
2606 ///
2607 /// \param Loc The insert and source location description.
2608 /// \param BodyGenCB Callback that will generate the region code.
2609 /// \param FiniCB Callback to finalize variable copies.
2610 /// \param IsThreads If true, with threads clause or without clause;
2611 /// otherwise, with simd clause;
2612 ///
2613 /// \returns The insertion position *after* the ordered.
2614 InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc,
2615 BodyGenCallbackTy BodyGenCB,
2616 FinalizeCallbackTy FiniCB,
2617 bool IsThreads);
2618
2619 /// Generator for '#omp sections'
2620 ///
2621 /// \param Loc The insert and source location description.
2622 /// \param AllocaIP The insertion points to be used for alloca instructions.
2623 /// \param SectionCBs Callbacks that will generate body of each section.
2624 /// \param PrivCB Callback to copy a given variable (think copy constructor).
2625 /// \param FiniCB Callback to finalize variable copies.
2626 /// \param IsCancellable Flag to indicate a cancellable parallel region.
2627 /// \param IsNowait If true, barrier - to ensure all sections are executed
2628 /// before moving forward will not be generated.
2629 /// \returns The insertion position *after* the sections.
2631 createSections(const LocationDescription &Loc, InsertPointTy AllocaIP,
2632 ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
2634 bool IsCancellable, bool IsNowait);
2635
2636 /// Generator for '#omp section'
2637 ///
2638 /// \param Loc The insert and source location description.
2639 /// \param BodyGenCB Callback that will generate the region body code.
2640 /// \param FiniCB Callback to finalize variable copies.
2641 /// \returns The insertion position *after* the section.
2642 InsertPointOrErrorTy createSection(const LocationDescription &Loc,
2643 BodyGenCallbackTy BodyGenCB,
2644 FinalizeCallbackTy FiniCB);
2645
2646 /// Generator for `#omp teams`
2647 ///
2648 /// \param Loc The location where the teams construct was encountered.
2649 /// \param BodyGenCB Callback that will generate the region code.
2650 /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
2651 /// it is as if lower bound is specified as equal to upperbound. If
2652 /// this is non-null, then upperbound must also be non-null.
2653 /// \param NumTeamsUpper Upper bound on the number of teams.
2654 /// \param ThreadLimit on the number of threads that may participate in a
2655 /// contention group created by each team.
2656 /// \param IfExpr is the integer argument value of the if condition on the
2657 /// teams clause.
2659 createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2660 Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr,
2661 Value *ThreadLimit = nullptr, Value *IfExpr = nullptr);
2662
2663 /// Generate conditional branch and relevant BasicBlocks through which private
2664 /// threads copy the 'copyin' variables from Master copy to threadprivate
2665 /// copies.
2666 ///
2667 /// \param IP insertion block for copyin conditional
2668 /// \param MasterVarPtr a pointer to the master variable
2669 /// \param PrivateVarPtr a pointer to the threadprivate variable
2670 /// \param IntPtrTy Pointer size type
2671 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
2672 // and copy.in.end block
2673 ///
2674 /// \returns The insertion point where copying operation to be emitted.
2676 Value *PrivateAddr,
2677 llvm::IntegerType *IntPtrTy,
2678 bool BranchtoEnd = true);
2679
2680 /// Create a runtime call for kmpc_Alloc
2681 ///
2682 /// \param Loc The insert and source location description.
2683 /// \param Size Size of allocated memory space
2684 /// \param Allocator Allocator information instruction
2685 /// \param Name Name of call Instruction for OMP_alloc
2686 ///
2687 /// \returns CallInst to the OMP_Alloc call
2688 CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
2689 Value *Allocator, std::string Name = "");
2690
2691 /// Create a runtime call for kmpc_free
2692 ///
2693 /// \param Loc The insert and source location description.
2694 /// \param Addr Address of memory space to be freed
2695 /// \param Allocator Allocator information instruction
2696 /// \param Name Name of call Instruction for OMP_Free
2697 ///
2698 /// \returns CallInst to the OMP_Free call
2699 CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
2700 Value *Allocator, std::string Name = "");
2701
2702 /// Create a runtime call for kmpc_threadprivate_cached
2703 ///
2704 /// \param Loc The insert and source location description.
2705 /// \param Pointer pointer to data to be cached
2706 /// \param Size size of data to be cached
2707 /// \param Name Name of call Instruction for callinst
2708 ///
2709 /// \returns CallInst to the thread private cache call.
2710 CallInst *createCachedThreadPrivate(const LocationDescription &Loc,
2713 const llvm::Twine &Name = Twine(""));
2714
2715 /// Create a runtime call for __tgt_interop_init
2716 ///
2717 /// \param Loc The insert and source location description.
2718 /// \param InteropVar variable to be allocated
2719 /// \param InteropType type of interop operation
2720 /// \param Device devide to which offloading will occur
2721 /// \param NumDependences number of dependence variables
2722 /// \param DependenceAddress pointer to dependence variables
2723 /// \param HaveNowaitClause does nowait clause exist
2724 ///
2725 /// \returns CallInst to the __tgt_interop_init call
2726 CallInst *createOMPInteropInit(const LocationDescription &Loc,
2727 Value *InteropVar,
2728 omp::OMPInteropType InteropType, Value *Device,
2729 Value *NumDependences,
2730 Value *DependenceAddress,
2731 bool HaveNowaitClause);
2732
2733 /// Create a runtime call for __tgt_interop_destroy
2734 ///
2735 /// \param Loc The insert and source location description.
2736 /// \param InteropVar variable to be allocated
2737 /// \param Device devide to which offloading will occur
2738 /// \param NumDependences number of dependence variables
2739 /// \param DependenceAddress pointer to dependence variables
2740 /// \param HaveNowaitClause does nowait clause exist
2741 ///
2742 /// \returns CallInst to the __tgt_interop_destroy call
2743 CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
2744 Value *InteropVar, Value *Device,
2745 Value *NumDependences,
2746 Value *DependenceAddress,
2747 bool HaveNowaitClause);
2748
2749 /// Create a runtime call for __tgt_interop_use
2750 ///
2751 /// \param Loc The insert and source location description.
2752 /// \param InteropVar variable to be allocated
2753 /// \param Device devide to which offloading will occur
2754 /// \param NumDependences number of dependence variables
2755 /// \param DependenceAddress pointer to dependence variables
2756 /// \param HaveNowaitClause does nowait clause exist
2757 ///
2758 /// \returns CallInst to the __tgt_interop_use call
2759 CallInst *createOMPInteropUse(const LocationDescription &Loc,
2760 Value *InteropVar, Value *Device,
2761 Value *NumDependences, Value *DependenceAddress,
2762 bool HaveNowaitClause);
2763
2764 /// The `omp target` interface
2765 ///
2766 /// For more information about the usage of this interface,
2767 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
2768 ///
2769 ///{
2770
2771 /// Create a runtime call for kmpc_target_init
2772 ///
2773 /// \param Loc The insert and source location description.
2774 /// \param Attrs Structure containing the default attributes, including
2775 /// numbers of threads and teams to launch the kernel with.
2777 const LocationDescription &Loc,
2779
2780 /// Create a runtime call for kmpc_target_deinit
2781 ///
2782 /// \param Loc The insert and source location description.
2783 /// \param TeamsReductionDataSize The maximal size of all the reduction data
2784 /// for teams reduction.
2785 /// \param TeamsReductionBufferLength The number of elements (each of up to
2786 /// \p TeamsReductionDataSize size), in the teams reduction buffer.
2787 void createTargetDeinit(const LocationDescription &Loc,
2788 int32_t TeamsReductionDataSize = 0,
2789 int32_t TeamsReductionBufferLength = 1024);
2790
2791 ///}
2792
2793 /// Helpers to read/write kernel annotations from the IR.
2794 ///
2795 ///{
2796
2797 /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none
2798 /// is set.
2799 static std::pair<int32_t, int32_t>
2800 readThreadBoundsForKernel(const Triple &T, Function &Kernel);
2801 static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel,
2802 int32_t LB, int32_t UB);
2803
2804 /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none
2805 /// is set.
2806 static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T,
2807 Function &Kernel);
2808 static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB,
2809 int32_t UB);
2810 ///}
2811
2812private:
2813 // Sets the function attributes expected for the outlined function
2814 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
2815
2816 // Creates the function ID/Address for the given outlined function.
2817 // In the case of an embedded device function the address of the function is
2818 // used, in the case of a non-offload function a constant is created.
2819 Constant *createOutlinedFunctionID(Function *OutlinedFn,
2820 StringRef EntryFnIDName);
2821
2822 // Creates the region entry address for the outlined function
2823 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
2824 StringRef EntryFnName);
2825
2826public:
2827 /// Functions used to generate a function with the given name.
2829 std::function<Expected<Function *>(StringRef FunctionName)>;
2830
2831 /// Create a unique name for the entry function using the source location
2832 /// information of the current target region. The name will be something like:
2833 ///
2834 /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
2835 ///
2836 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
2837 /// mangled name of the function that encloses the target region and BB is the
2838 /// line number of the target region. CC is a count added when more than one
2839 /// region is located at the same location.
2840 ///
2841 /// If this target outline function is not an offload entry, we don't need to
2842 /// register it. This may happen if it is guarded by an if clause that is
2843 /// false at compile time, or no target archs have been specified.
2844 ///
2845 /// The created target region ID is used by the runtime library to identify
2846 /// the current target region, so it only has to be unique and not
2847 /// necessarily point to anything. It could be the pointer to the outlined
2848 /// function that implements the target region, but we aren't using that so
2849 /// that the compiler doesn't need to keep that, and could therefore inline
2850 /// the host function if proven worthwhile during optimization. In the other
2851 /// hand, if emitting code for the device, the ID has to be the function
2852 /// address so that it can retrieved from the offloading entry and launched
2853 /// by the runtime library. We also mark the outlined function to have
2854 /// external linkage in case we are emitting code for the device, because
2855 /// these functions will be entry points to the device.
2856 ///
2857 /// \param InfoManager The info manager keeping track of the offload entries
2858 /// \param EntryInfo The entry information about the function
2859 /// \param GenerateFunctionCallback The callback function to generate the code
2860 /// \param OutlinedFunction Pointer to the outlined function
2861 /// \param EntryFnIDName Name of the ID o be created
2863 FunctionGenCallback &GenerateFunctionCallback,
2864 bool IsOffloadEntry, Function *&OutlinedFn,
2865 Constant *&OutlinedFnID);
2866
2867 /// Registers the given function and sets up the attribtues of the function
2868 /// Returns the FunctionID.
2869 ///
2870 /// \param InfoManager The info manager keeping track of the offload entries
2871 /// \param EntryInfo The entry information about the function
2872 /// \param OutlinedFunction Pointer to the outlined function
2873 /// \param EntryFnName Name of the outlined function
2874 /// \param EntryFnIDName Name of the ID o be created
2876 Function *OutlinedFunction,
2877 StringRef EntryFnName,
2878 StringRef EntryFnIDName);
2879
2880 /// Type of BodyGen to use for region codegen
2881 ///
2882 /// Priv: If device pointer privatization is required, emit the body of the
2883 /// region here. It will have to be duplicated: with and without
2884 /// privatization.
2885 /// DupNoPriv: If we need device pointer privatization, we need
2886 /// to emit the body of the region with no privatization in the 'else' branch
2887 /// of the conditional.
2888 /// NoPriv: If we don't require privatization of device
2889 /// pointers, we emit the body in between the runtime calls. This avoids
2890 /// duplicating the body code.
2892
2893 /// Callback type for creating the map infos for the kernel parameters.
2894 /// \param CodeGenIP is the insertion point where code should be generated,
2895 /// if any.
2898
2899private:
2900 /// Emit the array initialization or deletion portion for user-defined mapper
2901 /// code generation. First, it evaluates whether an array section is mapped
2902 /// and whether the \a MapType instructs to delete this section. If \a IsInit
2903 /// is true, and \a MapType indicates to not delete this array, array
2904 /// initialization code is generated. If \a IsInit is false, and \a MapType
2905 /// indicates to delete this array, array deletion code is generated.
2906 void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
2907 llvm::Value *Base, llvm::Value *Begin,
2908 llvm::Value *Size, llvm::Value *MapType,
2909 llvm::Value *MapName, TypeSize ElementSize,
2910 llvm::BasicBlock *ExitBB, bool IsInit);
2911
2912public:
2913 /// Emit the user-defined mapper function. The code generation follows the
2914 /// pattern in the example below.
2915 /// \code
2916 /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
2917 /// void *base, void *begin,
2918 /// int64_t size, int64_t type,
2919 /// void *name = nullptr) {
2920 /// // Allocate space for an array section first or add a base/begin for
2921 /// // pointer dereference.
2922 /// if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
2923 /// !maptype.IsDelete)
2924 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
2925 /// size*sizeof(Ty), clearToFromMember(type));
2926 /// // Map members.
2927 /// for (unsigned i = 0; i < size; i++) {
2928 /// // For each component specified by this mapper:
2929 /// for (auto c : begin[i]->all_components) {
2930 /// if (c.hasMapper())
2931 /// (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
2932 /// c.arg_size,
2933 /// c.arg_type, c.arg_name);
2934 /// else
2935 /// __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
2936 /// c.arg_begin, c.arg_size, c.arg_type,
2937 /// c.arg_name);
2938 /// }
2939 /// }
2940 /// // Delete the array section.
2941 /// if (size > 1 && maptype.IsDelete)
2942 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
2943 /// size*sizeof(Ty), clearToFromMember(type));
2944 /// }
2945 /// \endcode
2946 ///
2947 /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the
2948 /// MapInfos and returns.
2949 /// \param ElemTy DeclareMapper element type.
2950 /// \param FuncName Optional param to specify mapper function name.
2951 /// \param CustomMapperCB Optional callback to generate code related to
2952 /// custom mappers.
2954 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
2955 llvm::Value *BeginArg)>
2956 PrivAndGenMapInfoCB,
2957 llvm::Type *ElemTy, StringRef FuncName,
2958 function_ref<bool(unsigned int, Function **)> CustomMapperCB = nullptr);
2959
2960 /// Generator for '#omp target data'
2961 ///
2962 /// \param Loc The location where the target data construct was encountered.
2963 /// \param AllocaIP The insertion points to be used for alloca instructions.
2964 /// \param CodeGenIP The insertion point at which the target directive code
2965 /// should be placed.
2966 /// \param IsBegin If true then emits begin mapper call otherwise emits
2967 /// end mapper call.
2968 /// \param DeviceID Stores the DeviceID from the device clause.
2969 /// \param IfCond Value which corresponds to the if clause condition.
2970 /// \param Info Stores all information realted to the Target Data directive.
2971 /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
2972 /// \param BodyGenCB Optional Callback to generate the region code.
2973 /// \param DeviceAddrCB Optional callback to generate code related to
2974 /// use_device_ptr and use_device_addr.
2975 /// \param CustomMapperCB Optional callback to generate code related to
2976 /// custom mappers.
2978 const LocationDescription &Loc, InsertPointTy AllocaIP,
2979 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
2981 omp::RuntimeFunction *MapperFunc = nullptr,
2983 BodyGenTy BodyGenType)>
2984 BodyGenCB = nullptr,
2985 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2986 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr,
2987 Value *SrcLocInfo = nullptr);
2988
2990 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
2991
2993 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
2994 InsertPointTy CodeGenIP)>;
2995
2996 /// Generator for '#omp target'
2997 ///
2998 /// \param Loc where the target data construct was encountered.
2999 /// \param IsOffloadEntry whether it is an offload entry.
3000 /// \param CodeGenIP The insertion point where the call to the outlined
3001 /// function should be emitted.
3002 /// \param EntryInfo The entry information about the function.
3003 /// \param DefaultAttrs Structure containing the default attributes, including
3004 /// numbers of threads and teams to launch the kernel with.
3005 /// \param RuntimeAttrs Structure containing the runtime numbers of threads
3006 /// and teams to launch the kernel with.
3007 /// \param IfCond value of the `if` clause.
3008 /// \param Inputs The input values to the region that will be passed.
3009 /// as arguments to the outlined function.
3010 /// \param BodyGenCB Callback that will generate the region code.
3011 /// \param ArgAccessorFuncCB Callback that will generate accessors
3012 /// instructions for passed in target arguments where neccessary
3013 /// \param Dependencies A vector of DependData objects that carry
3014 /// dependency information as passed in the depend clause
3015 /// \param HasNowait Whether the target construct has a `nowait` clause or
3016 /// not.
3018 const LocationDescription &Loc, bool IsOffloadEntry,
3021 TargetRegionEntryInfo &EntryInfo,
3022 const TargetKernelDefaultAttrs &DefaultAttrs,
3023 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
3024 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
3025 TargetBodyGenCallbackTy BodyGenCB,
3026 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
3027 SmallVector<DependData> Dependencies = {}, bool HasNowait = false);
3028
3029 /// Returns __kmpc_for_static_init_* runtime function for the specified
3030 /// size \a IVSize and sign \a IVSigned. Will create a distribute call
3031 /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
3032 FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned,
3033 bool IsGPUDistribute);
3034
3035 /// Returns __kmpc_dispatch_init_* runtime function for the specified
3036 /// size \a IVSize and sign \a IVSigned.
3037 FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned);
3038
3039 /// Returns __kmpc_dispatch_next_* runtime function for the specified
3040 /// size \a IVSize and sign \a IVSigned.
3041 FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned);
3042
3043 /// Returns __kmpc_dispatch_fini_* runtime function for the specified
3044 /// size \a IVSize and sign \a IVSigned.
3045 FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned);
3046
3047 /// Returns __kmpc_dispatch_deinit runtime function.
3049
3050 /// Declarations for LLVM-IR types (simple, array, function and structure) are
3051 /// generated below. Their names are defined and used in OpenMPKinds.def. Here
3052 /// we provide the declarations, the initializeTypes function will provide the
3053 /// values.
3054 ///
3055 ///{
3056#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
3057#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
3058 ArrayType *VarName##Ty = nullptr; \
3059 PointerType *VarName##PtrTy = nullptr;
3060#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
3061 FunctionType *VarName = nullptr; \
3062 PointerType *VarName##Ptr = nullptr;
3063#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
3064 StructType *VarName = nullptr; \
3065 PointerType *VarName##Ptr = nullptr;
3066#include "llvm/Frontend/OpenMP/OMPKinds.def"
3067
3068 ///}
3069
3070private:
3071 /// Create all simple and struct types exposed by the runtime and remember
3072 /// the llvm::PointerTypes of them for easy access later.
3073 void initializeTypes(Module &M);
3074
3075 /// Common interface for generating entry calls for OMP Directives.
3076 /// if the directive has a region/body, It will set the insertion
3077 /// point to the body
3078 ///
3079 /// \param OMPD Directive to generate entry blocks for
3080 /// \param EntryCall Call to the entry OMP Runtime Function
3081 /// \param ExitBB block where the region ends.
3082 /// \param Conditional indicate if the entry call result will be used
3083 /// to evaluate a conditional of whether a thread will execute
3084 /// body code or not.
3085 ///
3086 /// \return The insertion position in exit block
3087 InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
3088 BasicBlock *ExitBB,
3089 bool Conditional = false);
3090
3091 /// Common interface to finalize the region
3092 ///
3093 /// \param OMPD Directive to generate exiting code for
3094 /// \param FinIP Insertion point for emitting Finalization code and exit call
3095 /// \param ExitCall Call to the ending OMP Runtime Function
3096 /// \param HasFinalize indicate if the directive will require finalization
3097 /// and has a finalization callback in the stack that
3098 /// should be called.
3099 ///
3100 /// \return The insertion position in exit block
3101 InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
3102 InsertPointTy FinIP,
3103 Instruction *ExitCall,
3104 bool HasFinalize = true);
3105
3106 /// Common Interface to generate OMP inlined regions
3107 ///
3108 /// \param OMPD Directive to generate inlined region for
3109 /// \param EntryCall Call to the entry OMP Runtime Function
3110 /// \param ExitCall Call to the ending OMP Runtime Function
3111 /// \param BodyGenCB Body code generation callback.
3112 /// \param FiniCB Finalization Callback. Will be called when finalizing region
3113 /// \param Conditional indicate if the entry call result will be used
3114 /// to evaluate a conditional of whether a thread will execute
3115 /// body code or not.
3116 /// \param HasFinalize indicate if the directive will require finalization
3117 /// and has a finalization callback in the stack that
3118 /// should be called.
3119 /// \param IsCancellable if HasFinalize is set to true, indicate if the
3120 /// the directive should be cancellable.
3121 /// \return The insertion point after the region
3123 EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
3124 Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
3125 FinalizeCallbackTy FiniCB, bool Conditional = false,
3126 bool HasFinalize = true, bool IsCancellable = false);
3127
3128 /// Get the platform-specific name separator.
3129 /// \param Parts different parts of the final name that needs separation
3130 /// \param FirstSeparator First separator used between the initial two
3131 /// parts of the name.
3132 /// \param Separator separator used between all of the rest consecutive
3133 /// parts of the name
3134 static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
3135 StringRef FirstSeparator,
3136 StringRef Separator);
3137
3138 /// Returns corresponding lock object for the specified critical region
3139 /// name. If the lock object does not exist it is created, otherwise the
3140 /// reference to the existing copy is returned.
3141 /// \param CriticalName Name of the critical region.
3142 ///
3143 Value *getOMPCriticalRegionLock(StringRef CriticalName);
3144
3145 /// Callback type for Atomic Expression update
3146 /// ex:
3147 /// \code{.cpp}
3148 /// unsigned x = 0;
3149 /// #pragma omp atomic update
3150 /// x = Expr(x_old); //Expr() is any legal operation
3151 /// \endcode
3152 ///
3153 /// \param XOld the value of the atomic memory address to use for update
3154 /// \param IRB reference to the IRBuilder to use
3155 ///
3156 /// \returns Value to update X to.
3157 using AtomicUpdateCallbackTy =
3158 const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>;
3159
3160private:
3161 enum AtomicKind { Read, Write, Update, Capture, Compare };
3162
3163 /// Determine whether to emit flush or not
3164 ///
3165 /// \param Loc The insert and source location description.
3166 /// \param AO The required atomic ordering
3167 /// \param AK The OpenMP atomic operation kind used.
3168 ///
3169 /// \returns wether a flush was emitted or not
3170 bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
3171 AtomicOrdering AO, AtomicKind AK);
3172
3173 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3174 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3175 /// Only Scalar data types.
3176 ///
3177 /// \param AllocaIP The insertion point to be used for alloca
3178 /// instructions.
3179 /// \param X The target atomic pointer to be updated
3180 /// \param XElemTy The element type of the atomic pointer.
3181 /// \param Expr The value to update X with.
3182 /// \param AO Atomic ordering of the generated atomic
3183 /// instructions.
3184 /// \param RMWOp The binary operation used for update. If
3185 /// operation is not supported by atomicRMW,
3186 /// or belong to {FADD, FSUB, BAD_BINOP}.
3187 /// Then a `cmpExch` based atomic will be generated.
3188 /// \param UpdateOp Code generator for complex expressions that cannot be
3189 /// expressed through atomicrmw instruction.
3190 /// \param VolatileX true if \a X volatile?
3191 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3192 /// update expression, false otherwise.
3193 /// (e.g. true for X = X BinOp Expr)
3194 ///
3195 /// \returns A pair of the old value of X before the update, and the value
3196 /// used for the update.
3197 Expected<std::pair<Value *, Value *>>
3198 emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3200 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
3201 bool IsXBinopExpr);
3202
3203 /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
3204 ///
3205 /// \Return The instruction
3206 Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3207 AtomicRMWInst::BinOp RMWOp);
3208
3209public:
3210 /// a struct to pack relevant information while generating atomic Ops
3212 Value *Var = nullptr;
3213 Type *ElemTy = nullptr;
3214 bool IsSigned = false;
3215 bool IsVolatile = false;
3216 };
3217
3218 /// Emit atomic Read for : V = X --- Only Scalar data types.
3219 ///
3220 /// \param Loc The insert and source location description.
3221 /// \param X The target pointer to be atomically read
3222 /// \param V Memory address where to store atomically read
3223 /// value
3224 /// \param AO Atomic ordering of the generated atomic
3225 /// instructions.
3226 ///
3227 /// \return Insertion point after generated atomic read IR.
3230 AtomicOrdering AO);
3231
3232 /// Emit atomic write for : X = Expr --- Only Scalar data types.
3233 ///
3234 /// \param Loc The insert and source location description.
3235 /// \param X The target pointer to be atomically written to
3236 /// \param Expr The value to store.
3237 /// \param AO Atomic ordering of the generated atomic
3238 /// instructions.
3239 ///
3240 /// \return Insertion point after generated atomic Write IR.
3242 AtomicOpValue &X, Value *Expr,
3243 AtomicOrdering AO);
3244
3245 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3246 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3247 /// Only Scalar data types.
3248 ///
3249 /// \param Loc The insert and source location description.
3250 /// \param AllocaIP The insertion point to be used for alloca instructions.
3251 /// \param X The target atomic pointer to be updated
3252 /// \param Expr The value to update X with.
3253 /// \param AO Atomic ordering of the generated atomic instructions.
3254 /// \param RMWOp The binary operation used for update. If operation
3255 /// is not supported by atomicRMW, or belong to
3256 /// {FADD, FSUB, BAD_BINOP}. Then a `cmpExch` based
3257 /// atomic will be generated.
3258 /// \param UpdateOp Code generator for complex expressions that cannot be
3259 /// expressed through atomicrmw instruction.
3260 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3261 /// update expression, false otherwise.
3262 /// (e.g. true for X = X BinOp Expr)
3263 ///
3264 /// \return Insertion point after generated atomic update IR.
3267 AtomicOpValue &X, Value *Expr, AtomicOrdering AO,
3269 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr);
3270
3271 /// Emit atomic update for constructs: --- Only Scalar data types
3272 /// V = X; X = X BinOp Expr ,
3273 /// X = X BinOp Expr; V = X,
3274 /// V = X; X = Expr BinOp X,
3275 /// X = Expr BinOp X; V = X,
3276 /// V = X; X = UpdateOp(X),
3277 /// X = UpdateOp(X); V = X,
3278 ///
3279 /// \param Loc The insert and source location description.
3280 /// \param AllocaIP The insertion point to be used for alloca instructions.
3281 /// \param X The target atomic pointer to be updated
3282 /// \param V Memory address where to store captured value
3283 /// \param Expr The value to update X with.
3284 /// \param AO Atomic ordering of the generated atomic instructions
3285 /// \param RMWOp The binary operation used for update. If
3286 /// operation is not supported by atomicRMW, or belong to
3287 /// {FADD, FSUB, BAD_BINOP}. Then a cmpExch based
3288 /// atomic will be generated.
3289 /// \param UpdateOp Code generator for complex expressions that cannot be
3290 /// expressed through atomicrmw instruction.
3291 /// \param UpdateExpr true if X is an in place update of the form
3292 /// X = X BinOp Expr or X = Expr BinOp X
3293 /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
3294 /// update expression, false otherwise.
3295 /// (e.g. true for X = X BinOp Expr)
3296 /// \param IsPostfixUpdate true if original value of 'x' must be stored in
3297 /// 'v', not an updated one.
3298 ///
3299 /// \return Insertion point after generated atomic capture IR.
3302 AtomicOpValue &X, AtomicOpValue &V, Value *Expr,
3304 AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr,
3305 bool IsPostfixUpdate, bool IsXBinopExpr);
3306
3307 /// Emit atomic compare for constructs: --- Only scalar data types
3308 /// cond-expr-stmt:
3309 /// x = x ordop expr ? expr : x;
3310 /// x = expr ordop x ? expr : x;
3311 /// x = x == e ? d : x;
3312 /// x = e == x ? d : x; (this one is not in the spec)
3313 /// cond-update-stmt:
3314 /// if (x ordop expr) { x = expr; }
3315 /// if (expr ordop x) { x = expr; }
3316 /// if (x == e) { x = d; }
3317 /// if (e == x) { x = d; } (this one is not in the spec)
3318 /// conditional-update-capture-atomic:
3319 /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
3320 /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
3321 /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3322 /// IsFailOnly=true)
3323 /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
3324 /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3325 /// IsFailOnly=true)
3326 ///
3327 /// \param Loc The insert and source location description.
3328 /// \param X The target atomic pointer to be updated.
3329 /// \param V Memory address where to store captured value (for
3330 /// compare capture only).
3331 /// \param R Memory address where to store comparison result
3332 /// (for compare capture with '==' only).
3333 /// \param E The expected value ('e') for forms that use an
3334 /// equality comparison or an expression ('expr') for
3335 /// forms that use 'ordop' (logically an atomic maximum or
3336 /// minimum).
3337 /// \param D The desired value for forms that use an equality
3338 /// comparison. If forms that use 'ordop', it should be
3339 /// \p nullptr.
3340 /// \param AO Atomic ordering of the generated atomic instructions.
3341 /// \param Op Atomic compare operation. It can only be ==, <, or >.
3342 /// \param IsXBinopExpr True if the conditional statement is in the form where
3343 /// x is on LHS. It only matters for < or >.
3344 /// \param IsPostfixUpdate True if original value of 'x' must be stored in
3345 /// 'v', not an updated one (for compare capture
3346 /// only).
3347 /// \param IsFailOnly True if the original value of 'x' is stored to 'v'
3348 /// only when the comparison fails. This is only valid for
3349 /// the case the comparison is '=='.
3350 ///
3351 /// \return Insertion point after generated atomic capture IR.
3356 bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
3359 AtomicOpValue &R, Value *E, Value *D,
3360 AtomicOrdering AO,
3362 bool IsXBinopExpr, bool IsPostfixUpdate,
3363 bool IsFailOnly, AtomicOrdering Failure);
3364
3365 /// Create the control flow structure of a canonical OpenMP loop.
3366 ///
3367 /// The emitted loop will be disconnected, i.e. no edge to the loop's
3368 /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
3369 /// IRBuilder location is not preserved.
3370 ///
3371 /// \param DL DebugLoc used for the instructions in the skeleton.
3372 /// \param TripCount Value to be used for the trip count.
3373 /// \param F Function in which to insert the BasicBlocks.
3374 /// \param PreInsertBefore Where to insert BBs that execute before the body,
3375 /// typically the body itself.
3376 /// \param PostInsertBefore Where to insert BBs that execute after the body.
3377 /// \param Name Base name used to derive BB
3378 /// and instruction names.
3379 ///
3380 /// \returns The CanonicalLoopInfo that represents the emitted loop.
3382 Function *F,
3383 BasicBlock *PreInsertBefore,
3384 BasicBlock *PostInsertBefore,
3385 const Twine &Name = {});
3386 /// OMP Offload Info Metadata name string
3387 const std::string ompOffloadInfoName = "omp_offload.info";
3388
3389 /// Loads all the offload entries information from the host IR
3390 /// metadata. This function is only meant to be used with device code
3391 /// generation.
3392 ///
3393 /// \param M Module to load Metadata info from. Module passed maybe
3394 /// loaded from bitcode file, i.e, different from OpenMPIRBuilder::M module.
3396
3397 /// Loads all the offload entries information from the host IR
3398 /// metadata read from the file passed in as the HostFilePath argument. This
3399 /// function is only meant to be used with device code generation.
3400 ///
3401 /// \param HostFilePath The path to the host IR file,
3402 /// used to load in offload metadata for the device, allowing host and device
3403 /// to maintain the same metadata mapping.
3404 void loadOffloadInfoMetadata(StringRef HostFilePath);
3405
3406 /// Gets (if variable with the given name already exist) or creates
3407 /// internal global variable with the specified Name. The created variable has
3408 /// linkage CommonLinkage by default and is initialized by null value.
3409 /// \param Ty Type of the global variable. If it is exist already the type
3410 /// must be the same.
3411 /// \param Name Name of the variable.
3413 unsigned AddressSpace = 0);
3414};
3415
3416/// Class to represented the control flow structure of an OpenMP canonical loop.
3417///
3418/// The control-flow structure is standardized for easy consumption by
3419/// directives associated with loops. For instance, the worksharing-loop
3420/// construct may change this control flow such that each loop iteration is
3421/// executed on only one thread. The constraints of a canonical loop in brief
3422/// are:
3423///
3424/// * The number of loop iterations must have been computed before entering the
3425/// loop.
3426///
3427/// * Has an (unsigned) logical induction variable that starts at zero and
3428/// increments by one.
3429///
3430/// * The loop's CFG itself has no side-effects. The OpenMP specification
3431/// itself allows side-effects, but the order in which they happen, including
3432/// how often or whether at all, is unspecified. We expect that the frontend
3433/// will emit those side-effect instructions somewhere (e.g. before the loop)
3434/// such that the CanonicalLoopInfo itself can be side-effect free.
3435///
3436/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
3437/// execution of a loop body that satifies these constraints. It does NOT
3438/// represent arbitrary SESE regions that happen to contain a loop. Do not use
3439/// CanonicalLoopInfo for such purposes.
3440///
3441/// The control flow can be described as follows:
3442///
3443/// Preheader
3444/// |
3445/// /-> Header
3446/// | |
3447/// | Cond---\
3448/// | | |
3449/// | Body |
3450/// | | | |
3451/// | <...> |
3452/// | | | |
3453/// \--Latch |
3454/// |
3455/// Exit
3456/// |
3457/// After
3458///
3459/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3460/// including) and end at AfterIP (at the After's first instruction, excluding).
3461/// That is, instructions in the Preheader and After blocks (except the
3462/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3463/// side-effects. Typically, the Preheader is used to compute the loop's trip
3464/// count. The instructions from BodyIP (at the Body block's first instruction,
3465/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
3466/// control and thus can have side-effects. The body block is the single entry
3467/// point into the loop body, which may contain arbitrary control flow as long
3468/// as all control paths eventually branch to the Latch block.
3469///
3470/// TODO: Consider adding another standardized BasicBlock between Body CFG and
3471/// Latch to guarantee that there is only a single edge to the latch. It would
3472/// make loop transformations easier to not needing to consider multiple
3473/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
3474/// an equivalant to PreheaderIP, AfterIP and BodyIP for inserting code that
3475/// executes after each body iteration.
3476///
3477/// There must be no loop-carried dependencies through llvm::Values. This is
3478/// equivalant to that the Latch has no PHINode and the Header's only PHINode is
3479/// for the induction variable.
3480///
3481/// All code in Header, Cond, Latch and Exit (plus the terminator of the
3482/// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked
3483/// by assertOK(). They are expected to not be modified unless explicitly
3484/// modifying the CanonicalLoopInfo through a methods that applies a OpenMP
3485/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
3486/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
3487/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
3488/// anymore as its underlying control flow may not exist anymore.
3489/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3490/// may also return a new CanonicalLoopInfo that can be passed to other
3491/// loop-associated construct implementing methods. These loop-transforming
3492/// methods may either create a new CanonicalLoopInfo usually using
3493/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
3494/// modify one of the input CanonicalLoopInfo and return it as representing the
3495/// modified loop. What is done is an implementation detail of
3496/// transformation-implementing method and callers should always assume that the
3497/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
3498/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
3499/// created by createCanonicalLoop, such that transforming methods do not have
3500/// to special case where the CanonicalLoopInfo originated from.
3501///
3502/// Generally, methods consuming CanonicalLoopInfo do not need an
3503/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3504/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3505/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3506/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3507/// any InsertPoint in the Preheader, After or Block can still be used after
3508/// calling such a method.
3509///
3510/// TODO: Provide mechanisms for exception handling and cancellation points.
3511///
3512/// Defined outside OpenMPIRBuilder because nested classes cannot be
3513/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
3515 friend class OpenMPIRBuilder;
3516
3517private:
3518 BasicBlock *Header = nullptr;
3519 BasicBlock *Cond = nullptr;
3520 BasicBlock *Latch = nullptr;
3521 BasicBlock *Exit = nullptr;
3522
3523 /// Add the control blocks of this loop to \p BBs.
3524 ///
3525 /// This does not include any block from the body, including the one returned
3526 /// by getBody().
3527 ///
3528 /// FIXME: This currently includes the Preheader and After blocks even though
3529 /// their content is (mostly) not under CanonicalLoopInfo's control.
3530 /// Re-evaluated whether this makes sense.
3531 void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
3532
3533 /// Sets the number of loop iterations to the given value. This value must be
3534 /// valid in the condition block (i.e., defined in the preheader) and is
3535 /// interpreted as an unsigned integer.
3536 void setTripCount(Value *TripCount);
3537
3538 /// Replace all uses of the canonical induction variable in the loop body with
3539 /// a new one.
3540 ///
3541 /// The intended use case is to update the induction variable for an updated
3542 /// iteration space such that it can stay normalized in the 0...tripcount-1
3543 /// range.
3544 ///
3545 /// The \p Updater is called with the (presumable updated) current normalized
3546 /// induction variable and is expected to return the value that uses of the
3547 /// pre-updated induction values should use instead, typically dependent on
3548 /// the new induction variable. This is a lambda (instead of e.g. just passing
3549 /// the new value) to be able to distinguish the uses of the pre-updated
3550 /// induction variable and uses of the induction varible to compute the
3551 /// updated induction variable value.
3552 void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
3553
3554public:
3555 /// Returns whether this object currently represents the IR of a loop. If
3556 /// returning false, it may have been consumed by a loop transformation or not
3557 /// been intialized. Do not use in this case;
3558 bool isValid() const { return Header; }
3559
3560 /// The preheader ensures that there is only a single edge entering the loop.
3561 /// Code that must be execute before any loop iteration can be emitted here,
3562 /// such as computing the loop trip count and begin lifetime markers. Code in
3563 /// the preheader is not considered part of the canonical loop.
3564 BasicBlock *getPreheader() const;
3565
3566 /// The header is the entry for each iteration. In the canonical control flow,
3567 /// it only contains the PHINode for the induction variable.
3569 assert(isValid() && "Requires a valid canonical loop");
3570 return Header;
3571 }
3572
3573 /// The condition block computes whether there is another loop iteration. If
3574 /// yes, branches to the body; otherwise to the exit block.
3576 assert(isValid() && "Requires a valid canonical loop");
3577 return Cond;
3578 }
3579
3580 /// The body block is the single entry for a loop iteration and not controlled
3581 /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
3582 /// eventually branch to the \p Latch block.
3584 assert(isValid() && "Requires a valid canonical loop");
3585 return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
3586 }
3587
3588 /// Reaching the latch indicates the end of the loop body code. In the
3589 /// canonical control flow, it only contains the increment of the induction
3590 /// variable.
3592 assert(isValid() && "Requires a valid canonical loop");
3593 return Latch;
3594 }
3595
3596 /// Reaching the exit indicates no more iterations are being executed.
3598 assert(isValid() && "Requires a valid canonical loop");
3599 return Exit;
3600 }
3601
3602 /// The after block is intended for clean-up code such as lifetime end
3603 /// markers. It is separate from the exit block to ensure, analogous to the
3604 /// preheader, it having just a single entry edge and being free from PHI
3605 /// nodes should there be multiple loop exits (such as from break
3606 /// statements/cancellations).
3608 assert(isValid() && "Requires a valid canonical loop");
3609 return Exit->getSingleSuccessor();
3610 }
3611
3612 /// Returns the llvm::Value containing the number of loop iterations. It must
3613 /// be valid in the preheader and always interpreted as an unsigned integer of
3614 /// any bit-width.
3616 assert(isValid() && "Requires a valid canonical loop");
3617 Instruction *CmpI = &Cond->front();
3618 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
3619 return CmpI->getOperand(1);
3620 }
3621
3622 /// Returns the instruction representing the current logical induction
3623 /// variable. Always unsigned, always starting at 0 with an increment of one.
3625 assert(isValid() && "Requires a valid canonical loop");
3626 Instruction *IndVarPHI = &Header->front();
3627 assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
3628 return IndVarPHI;
3629 }
3630
3631 /// Return the type of the induction variable (and the trip count).
3633 assert(isValid() && "Requires a valid canonical loop");
3634 return getIndVar()->getType();
3635 }
3636
3637 /// Return the insertion point for user code before the loop.
3639 assert(isValid() && "Requires a valid canonical loop");
3640 BasicBlock *Preheader = getPreheader();
3641 return {Preheader, std::prev(Preheader->end())};
3642 };
3643
3644 /// Return the insertion point for user code in the body.
3646 assert(isValid() && "Requires a valid canonical loop");
3647 BasicBlock *Body = getBody();
3648 return {Body, Body->begin()};
3649 };
3650
3651 /// Return the insertion point for user code after the loop.
3653 assert(isValid() && "Requires a valid canonical loop");
3655 return {After, After->begin()};
3656 };
3657
3659 assert(isValid() && "Requires a valid canonical loop");
3660 return Header->getParent();
3661 }
3662
3663 /// Consistency self-check.
3664 void assertOK() const;
3665
3666 /// Invalidate this loop. That is, the underlying IR does not fulfill the
3667 /// requirements of an OpenMP canonical loop anymore.
3668 void invalidate();
3669};
3670
3671} // end namespace llvm
3672
3673#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
arc branch finalize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file defines the BumpPtrAllocator interface.
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
DXIL Finalize Linkage
uint64_t Addr
std::string Name
uint32_t Index
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
This file defines constans and helpers used when dealing with OpenMP.
Provides definitions for Target specific Grid Values.
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysis::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development, "development", "for training")))
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Value * RHS
Value * LHS
an instruction to allocate memory on the stack
Definition: Instructions.h:63
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Align AtomicAlign
Definition: Atomic.h:22
bool UseLibcall
Definition: Atomic.h:24
IRBuilderBase * Builder
Definition: Atomic.h:18
uint64_t AtomicSizeInBits
Definition: Atomic.h:20
uint64_t ValueSizeInBits
Definition: Atomic.h:21
Align ValueAlign
Definition: Atomic.h:23
Type * Ty
Definition: Atomic.h:19
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:461
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
Tagged union holding either a T or a Error.
Definition: Error.h:481
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:276
BasicBlock * getBlock() const
Definition: IRBuilder.h:291
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1781
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:296
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:308
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
Class to represent integer types.
Definition: DerivedTypes.h:42
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
OffloadEntryInfoDeviceGlobalVar(unsigned Order, OMPTargetGlobalVarEntryKind Flags)
Definition: OMPIRBuilder.h:400
OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage, const std::string &VarName)
Definition: OMPIRBuilder.h:403
static bool classof(const OffloadEntryInfo *Info)
Definition: OMPIRBuilder.h:418
static bool classof(const OffloadEntryInfo *Info)
Definition: OMPIRBuilder.h:325
OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Definition: OMPIRBuilder.h:312
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:246
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:248
OffloadingEntryInfoKinds getKind() const
Definition: OMPIRBuilder.h:264
OffloadEntryInfo(OffloadingEntryInfoKinds Kind)
Definition: OMPIRBuilder.h:255
static bool classof(const OffloadEntryInfo *Info)
Definition: OMPIRBuilder.h:272
OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, uint32_t Flags)
Definition: OMPIRBuilder.h:256
Class that manages information about offload code regions and data.
Definition: OMPIRBuilder.h:234
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
Definition: OMPIRBuilder.h:440
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:379
@ OMPTargetDeviceClauseNoHost
The target is marked for non-host devices.
Definition: OMPIRBuilder.h:383
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:381
@ OMPTargetDeviceClauseNone
The target is marked as having no clause.
Definition: OMPIRBuilder.h:387
@ OMPTargetDeviceClauseHost
The target is marked for host devices.
Definition: OMPIRBuilder.h:385
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:299
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:301
OffloadEntriesInfoManager(OpenMPIRBuilder *builder)
Definition: OMPIRBuilder.h:292
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:290
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry..
Definition: OMPIRBuilder.h:359
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:365
@ OMPTargetGlobalVarEntryNone
Mark the entry as having no declare target entry kind.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:371
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
Definition: OMPIRBuilder.h:361
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
brief Applies action Action on all registered entries.
Definition: OMPIRBuilder.h:350
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:435
bool empty() const
Return true if a there are no entries defined.
Captures attributes that affect generating LLVM-IR using the OpenMPIRBuilder and related classes.
Definition: OMPIRBuilder.h:89
void setIsGPU(bool Value)
Definition: OMPIRBuilder.h:186
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:95
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
Definition: OMPIRBuilder.h:105
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:191
std::optional< StringRef > FirstSeparator
First separator used between the initial two parts of a name.
Definition: OMPIRBuilder.h:114
StringRef separator() const
Definition: OMPIRBuilder.h:177
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
void setFirstSeparator(StringRef FS)
Definition: OMPIRBuilder.h:189
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:167
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
Definition: OMPIRBuilder.h:111
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:108
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:150
SmallVector< Triple > TargetTriples
When compilation is being done for the OpenMP host (i.e.
Definition: OMPIRBuilder.h:123
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
std::optional< StringRef > Separator
Separator used between all of the rest consecutive parts of s name.
Definition: OMPIRBuilder.h:116
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:144
void setHasRequiresUnifiedAddress(bool Value)
void setOpenMPOffloadMandatory(bool Value)
Definition: OMPIRBuilder.h:188
void setIsTargetDevice(bool Value)
Definition: OMPIRBuilder.h:185
void setSeparator(StringRef S)
Definition: OMPIRBuilder.h:190
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:187
std::optional< omp::GV > GridValue
Definition: OMPIRBuilder.h:119
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
llvm::AllocaInst * CreateAlloca(llvm::Type *Ty, const llvm::Twine &Name) const override
Definition: OMPIRBuilder.h:498
AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits, uint64_t ValueSizeInBits, llvm::Align AtomicAlign, llvm::Align ValueAlign, bool UseLibcall, llvm::Value *AtomicVar)
Definition: OMPIRBuilder.h:489
void decorateWithTBAA(llvm::Instruction *I) override
Definition: OMPIRBuilder.h:497
llvm::Value * getAtomicPointer() const override
Definition: OMPIRBuilder.h:496
Struct that keeps the information that should be kept throughout a 'target data' region.
TargetDataInfo(bool RequiresDevicePointerInfo, bool SeparateBeginEndCalls)
SmallMapVector< const Value *, std::pair< Value *, Value * >, 4 > DevicePtrInfoMap
void clearArrayInfo()
Clear information about the data arrays.
unsigned NumberOfPtrs
The total number of pointers passed to the runtime library.
bool HasNoWait
Whether the target ... data directive has a nowait clause.
bool isValid()
Return true if the current target data information has valid arrays.
bool HasMapper
Indicate whether any user-defined mapper exists.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:476
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the RedctionGen CallBack type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:545
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for : X = Expr — Only Scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
EvalKind
Enum class for reduction evaluation types scalar, complex and aggregate.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attribtues of the function Returns the FunctionID.
GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
std::function< InsertPointOrErrorTy(InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)> ReductionGenCBTy
ReductionGen CallBack for MLIR.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
void setConfig(OpenMPIRBuilderConfig C)
Definition: OMPIRBuilder.h:511
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Get the create a name using the platform specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
std::function< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> StorableBodyGenCallbackTy
Definition: OMPIRBuilder.h:608
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
std::function< InsertPointOrErrorTy(InsertPointTy, Type *, Value *, Value *)> ReductionGenAtomicCBTy
Functions used to generate atomic reductions.
InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
std::function< InsertPointTy(InsertPointTy CodeGenIP, unsigned Index, Value **LHS, Value **RHS, Function *CurFn)> ReductionGenClangCBTy
ReductionGen CallBack for Clang.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
void pushFinalizationCB(const FinalizationInfo &FI)
Push a finalization callback on the finalization stack.
Definition: OMPIRBuilder.h:563
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy getInsertionPoint()
}
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:522
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
function_ref< InsertPointOrErrorTy(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
Definition: OMPIRBuilder.h:643
OpenMPIRBuilder(Module &M)
Create a new OpenMPIRBuilder operating on the given module M.
Definition: OMPIRBuilder.h:480
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:525
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
void popFinalizationCB()
Pop the last finalization callback from the finalization stack.
Definition: OMPIRBuilder.h:570
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
Definition: OMPIRBuilder.h:599
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
size_type count(StringRef Key) const
count - Return 1 if the element is in the map, 0 otherwise.
Definition: StringMap.h:276
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
bool pointsToAliveValue() const
Definition: ValueHandle.h:224
An efficient, type-erasing, non-owning reference to a callable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition: ISDOpcodes.h:71
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RTLDependenceKindTy
Dependence kind for RTL.
Definition: OMPConstants.h:273
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
AtomicOrdering
Atomic ordering for LLVM's memory model.
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType, Value *DepVal)
omp::RTLDependenceKindTy DepKind
bool IsCancellable
Flag to indicate if the directive is cancellable.
Definition: OMPIRBuilder.h:557
FinalizeCallbackTy FiniCB
The finalization callback provided by the last in-flight invocation of createXXXX for the directive o...
Definition: OMPIRBuilder.h:550
omp::Directive DK
The directive kind of the innermost directive that has an associated region which might require final...
Definition: OMPIRBuilder.h:554
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:647
LocationDescription(const InsertPointTy &IP)
Definition: OMPIRBuilder.h:650
LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
Definition: OMPIRBuilder.h:651
LocationDescription(const IRBuilderBase &IRB)
Definition: OMPIRBuilder.h:648
This structure contains combined information generated for mappable clauses, including base pointers,...
void append(MapInfosTy &CurInfo)
Append arrays in CurInfo.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
Function * getFunction() const
Return the function that contains the region to be outlined.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
std::function< void(Function &)> PostOutlineCBTy
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
ReductionInfo(Value *PrivateVariable)
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, ReductionGenClangCBTy ReductionGenClang, ReductionGenAtomicCBTy AtomicReductionGen)
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray, Value *SizesArray, Value *MapTypesArray, Value *MapTypesArrayEnd, Value *MappersArray, Value *MapNamesArray)
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs, Value *NumIterations, ArrayRef< Value * > NumTeams, ArrayRef< Value * > NumThreads, Value *DynCGGroupMem, bool HasNoWait)
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:205
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
Definition: OMPIRBuilder.h:207
bool operator<(const TargetRegionEntryInfo &RHS) const
Definition: OMPIRBuilder.h:226
TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count=0)
Definition: OMPIRBuilder.h:216
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57