1//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the OpenMPIRBuilder class and helpers used as a convenient
10// way to create LLVM instructions for OpenMP directives.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16
20#include "llvm/IR/DebugLoc.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/Module.h"
23#include "llvm/IR/ValueMap.h"
26#include "llvm/Support/Error.h"
28#include <forward_list>
29#include <map>
30#include <optional>
31
32namespace llvm {
33class CanonicalLoopInfo;
34class ScanInfo;
35struct TargetRegionEntryInfo;
36class OffloadEntriesInfoManager;
37class OpenMPIRBuilder;
38class Loop;
39class LoopAnalysis;
40class LoopInfo;
41
42namespace vfs {
43class FileSystem;
44} // namespace vfs
45
46/// Move the instructions after an InsertPoint to the beginning of another
47/// BasicBlock.
48///
49/// The instructions after \p IP are moved to the beginning of \p New which must
50/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
51/// \p New will be added such that there is no semantic change. Otherwise, the
52/// \p IP insert block remains degenerate and it is up to the caller to insert a
53/// terminator. \p DL is used as the debug location for the branch instruction
54/// if one is created.
55LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
56 bool CreateBranch, DebugLoc DL);
57
58/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
59/// insert location will stick to after the instruction before the insertion
60/// point (instead of moving with the instruction the InsertPoint stores
61/// internally).
62LLVM_ABI void spliceBB(IRBuilder<> &Builder, BasicBlock *New,
63 bool CreateBranch);
64
65/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
66/// (missing the terminator).
67///
68/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
69/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
70/// is true, a branch to the new successor will be created such that
71/// semantically there is no change; otherwise the block of the insertion point
72/// remains degenerate and it is the caller's responsibility to insert a
73/// terminator. \p DL is used as the debug location for the branch instruction
74/// if one is created. Returns the new successor block.
75LLVM_ABI BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
76 DebugLoc DL, llvm::Twine Name = {});
77
78/// Split a BasicBlock at \p Builder's insertion point, even if the block is
79/// degenerate (missing the terminator). Its new insert location will stick to
80/// after the instruction before the insertion point (instead of moving with the
81/// instruction the InsertPoint stores internally).
82LLVM_ABI BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
83 llvm::Twine Name = {});
84
85/// Split a BasicBlock at \p Builder's insertion point, even if the block is
86/// degenerate (missing the terminator). Its new insert location will stick to
87/// after the instruction before the insertion point (instead of moving with the
88/// instruction the InsertPoint stores internally).
89LLVM_ABI BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch,
90 llvm::Twine Name);
91
92/// Like splitBB, but reuses the current block's name for the new name.
93LLVM_ABI BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder,
94 bool CreateBranch,
95 llvm::Twine Suffix = ".split");
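/// A minimal usage sketch of the splitting helpers (illustrative; `BB` and
/// `SomeInst` are assumed to exist in the caller):
/// \code
///   IRBuilder<> Builder(BB);
///   Builder.SetInsertPoint(SomeInst);
///   // Split at the insertion point; the builder keeps inserting before the
///   // branch into the new block.
///   BasicBlock *Cont = splitBB(Builder, /*CreateBranch=*/true, "cont");
/// \endcode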
96
97/// Captures attributes that affect generating LLVM-IR using the
98/// OpenMPIRBuilder and related classes. Note that not all attributes are
99/// required for all classes or functions. In some use cases the configuration
100/// is not necessary at all, because the only functions that are called
101/// are ones that are not dependent on the configuration.
102class OpenMPIRBuilderConfig {
103public:
104 /// Flag to define whether to generate code for the role of the OpenMP host
105 /// (if set to false) or device (if set to true) in an offloading context. It
106 /// is set when the -fopenmp-is-target-device compiler frontend option is
107 /// specified.
108 std::optional<bool> IsTargetDevice;
109
110 /// Flag for specifying if the compilation is done for an accelerator. It is
111 /// set according to the architecture of the target triple and currently only
112 /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
113 /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
114 /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
115 /// like target with the ability to work as the OpenMP host is added, or if
116 /// the capabilities of the currently supported GPU architectures are
117 /// expanded.
118 std::optional<bool> IsGPU;
119
120 /// Flag for specifying if LLVMUsed information should be emitted.
121 std::optional<bool> EmitLLVMUsedMetaInfo;
122
123 /// Flag for specifying if offloading is mandatory.
124 std::optional<bool> OpenMPOffloadMandatory;
125
126 /// First separator used between the initial two parts of a name.
127 std::optional<StringRef> FirstSeparator;
128 /// Separator used between all of the remaining consecutive parts of a name.
129 std::optional<StringRef> Separator;
130
131 // Grid Value for the GPU target.
132 std::optional<omp::GV> GridValue;
133
134 /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
135 /// false`), this contains the list of associated offloading triples, if any.
136 SmallVector<Triple> TargetTriples;
137
138 // Default address space for the target.
139 unsigned DefaultTargetAS = 0;
140
141 LLVM_ABI OpenMPIRBuilderConfig();
142 LLVM_ABI OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
143 bool OpenMPOffloadMandatory,
144 bool HasRequiresReverseOffload,
145 bool HasRequiresUnifiedAddress,
146 bool HasRequiresUnifiedSharedMemory,
147 bool HasRequiresDynamicAllocators);
148
149 // Getter functions that assert if the required values are not present.
150 bool isTargetDevice() const {
151 assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
152 return *IsTargetDevice;
153 }
154
155 bool isGPU() const {
156 assert(IsGPU.has_value() && "IsGPU is not set");
157 return *IsGPU;
158 }
159
160 bool openMPOffloadMandatory() const {
161 assert(OpenMPOffloadMandatory.has_value() &&
162 "OpenMPOffloadMandatory is not set");
163 return *OpenMPOffloadMandatory;
164 }
165
166 omp::GV getGridValue() const {
167 assert(GridValue.has_value() && "GridValue is not set");
168 return *GridValue;
169 }
170
171 unsigned getDefaultTargetAS() const { return DefaultTargetAS; }
172
173 bool hasRequiresFlags() const { return RequiresFlags; }
174 LLVM_ABI bool hasRequiresReverseOffload() const;
175 LLVM_ABI bool hasRequiresUnifiedAddress() const;
176 LLVM_ABI bool hasRequiresUnifiedSharedMemory() const;
177 LLVM_ABI bool hasRequiresDynamicAllocators() const;
178
179 /// Returns requires directive clauses as flags compatible with those expected
180 /// by libomptarget.
181 LLVM_ABI int64_t getRequiresFlags() const;
182
183 // Returns the FirstSeparator if set, otherwise the default separator
184 // depending on isGPU().
185 StringRef firstSeparator() const {
186 if (FirstSeparator.has_value())
187 return *FirstSeparator;
188 if (isGPU())
189 return "_";
190 return ".";
191 }
192
193 // Returns the Separator if set, otherwise the default separator depending
194 // on isGPU().
195 StringRef separator() const {
196 if (Separator.has_value())
197 return *Separator;
198 if (isGPU())
199 return "$";
200 return ".";
201 }
202
203 void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
204 void setIsGPU(bool Value) { IsGPU = Value; }
205 void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
206 void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
207 void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
208 void setSeparator(StringRef S) { Separator = S; }
209 void setGridValue(omp::GV G) { GridValue = G; }
210 void setDefaultTargetAS(unsigned AS) { DefaultTargetAS = AS; }
211
212 LLVM_ABI void setHasRequiresReverseOffload(bool Value);
213 LLVM_ABI void setHasRequiresUnifiedAddress(bool Value);
214 LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value);
215 LLVM_ABI void setHasRequiresDynamicAllocators(bool Value);
216
217private:
218 /// Flags for specifying which requires directive clauses are present.
219 int64_t RequiresFlags;
220};
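/// A minimal configuration sketch (values illustrative; host compilation):
/// \code
///   OpenMPIRBuilderConfig Config;
///   Config.setIsTargetDevice(false);
///   Config.setIsGPU(false);
///   Config.setOpenMPOffloadMandatory(false);
///   Config.setHasRequiresUnifiedSharedMemory(false);
///   // Later handed to an OpenMPIRBuilder via setConfig(Config).
/// \endcode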
221
222/// Data structure to contain the information needed to uniquely identify
223/// a target entry.
224struct TargetRegionEntryInfo {
225 /// The prefix used for kernel names.
226 static constexpr const char *KernelNamePrefix = "__omp_offloading_";
227
228 std::string ParentName;
229 unsigned DeviceID;
230 unsigned FileID;
231 unsigned Line;
232 unsigned Count;
233
234 TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
235 TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
236 unsigned FileID, unsigned Line, unsigned Count = 0)
237 : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
238 Count(Count) {}
239
240 LLVM_ABI static void
241 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, StringRef ParentName,
242 unsigned DeviceID, unsigned FileID, unsigned Line,
243 unsigned Count);
244
245 bool operator<(const TargetRegionEntryInfo &RHS) const {
246 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
247 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
248 RHS.Count);
249 }
250};
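/// Illustrative use (values hypothetical): the generated function name combines
/// KernelNamePrefix with the device ID, file ID, parent function name and line,
/// e.g. something of the form "__omp_offloading_<deviceid>_<fileid>_foo_l42".
/// \code
///   SmallString<128> KernelName;
///   TargetRegionEntryInfo::getTargetRegionEntryFnName(
///       KernelName, /*ParentName=*/"foo", /*DeviceID=*/1, /*FileID=*/2,
///       /*Line=*/42, /*Count=*/0);
/// \endcode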
251
252/// Class that manages information about offload code regions and data
253class OffloadEntriesInfoManager {
254 OpenMPIRBuilder *OMPBuilder;
255 /// Number of entries registered so far.
256 unsigned OffloadingEntriesNum = 0;
257
258public:
259 /// Base class of the entries info.
260 class OffloadEntryInfo {
261 public:
262 /// Kind of a given entry.
263 enum OffloadingEntryInfoKinds : unsigned {
264 /// Entry is a target region.
265 OffloadingEntryInfoTargetRegion = 0,
266 /// Entry is a declare target variable.
267 OffloadingEntryInfoDeviceGlobalVar = 1,
268 /// Invalid entry info.
269 OffloadingEntryInfoInvalid = ~0u
270 };
271
272 protected:
273 OffloadEntryInfo() = delete;
274 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
275 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
276 uint32_t Flags)
277 : Flags(Flags), Order(Order), Kind(Kind) {}
278 ~OffloadEntryInfo() = default;
279
280 public:
281 bool isValid() const { return Order != ~0u; }
282 unsigned getOrder() const { return Order; }
283 OffloadingEntryInfoKinds getKind() const { return Kind; }
284 uint32_t getFlags() const { return Flags; }
285 void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
286 Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
287 void setAddress(Constant *V) {
288 assert(!Addr.pointsToAliveValue() && "Address has been set before!");
289 Addr = V;
290 }
291 static bool classof(const OffloadEntryInfo *Info) { return true; }
292
293 private:
294 /// Address of the entity that has to be mapped for offloading.
295 WeakTrackingVH Addr;
296
297 /// Flags associated with the device global.
298 uint32_t Flags = 0u;
299
300 /// Order this entry was emitted.
301 unsigned Order = ~0u;
302
303 OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
304 };
305
306 /// Return true if there are no entries defined.
307 LLVM_ABI bool empty() const;
308 /// Return number of entries defined so far.
309 unsigned size() const { return OffloadingEntriesNum; }
310
311 OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
312
313 //
314 // Target region entries related.
315 //
316
317 /// Kind of the target registry entry.
318 enum OMPTargetRegionEntryKind : uint32_t {
319 /// Mark the entry as target region.
320 OMPTargetRegionEntryTargetRegion = 0x0,
321 };
322
323 /// Target region entries info.
324 class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
325 /// Address that can be used as the ID of the entry.
326 Constant *ID = nullptr;
327
328 public:
329 OffloadEntryInfoTargetRegion()
330 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
331 explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
332 Constant *ID,
333 OMPTargetRegionEntryKind Flags)
334 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
335 ID(ID) {
336 setAddress(Addr);
337 }
338
339 Constant *getID() const { return ID; }
340 void setID(Constant *V) {
341 assert(!ID && "ID has been set before!");
342 ID = V;
343 }
344 static bool classof(const OffloadEntryInfo *Info) {
345 return Info->getKind() == OffloadingEntryInfoTargetRegion;
346 }
347 };
348
349 /// Initialize target region entry.
350 /// This is ONLY needed for DEVICE compilation.
351 LLVM_ABI void
352 initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
353 unsigned Order);
354 /// Register target region entry.
355 LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
356 Constant *Addr, Constant *ID,
357 OMPTargetRegionEntryKind Flags);
358 /// Return true if a target region entry with the provided information
359 /// exists.
360 LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
361 bool IgnoreAddressId = false) const;
362
363 // Return the Name based on \a EntryInfo using the next available Count.
364 LLVM_ABI void
365 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
366 const TargetRegionEntryInfo &EntryInfo);
367
368 /// Applies action \a Action on all registered entries.
369 typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
370 const OffloadEntryInfoTargetRegion &)>
371 OffloadTargetRegionEntryInfoActTy;
372 LLVM_ABI void
373 actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);
374
375 //
376 // Device global variable entries related.
377 //
378
379 /// Kind of the global variable entry.
380 enum OMPTargetGlobalVarEntryKind : uint32_t {
381 /// Mark the entry as a declare target 'to' entry.
382 OMPTargetGlobalVarEntryTo = 0x0,
383 /// Mark the entry as a declare target 'link' entry.
384 OMPTargetGlobalVarEntryLink = 0x1,
385 /// Mark the entry as a declare target enter.
386 OMPTargetGlobalVarEntryEnter = 0x2,
387 /// Mark the entry as having no declare target entry kind.
388 OMPTargetGlobalVarEntryNone = 0x3,
389 /// Mark the entry as a declare target indirect global.
390 OMPTargetGlobalVarEntryIndirect = 0x8,
391 /// Mark the entry as a register requires global.
392 OMPTargetGlobalRegisterRequires = 0x10,
393 };
394
395 /// Kind of device clause for declare target variables and functions.
396 /// NOTE: Currently not used as part of a variable entry; used by Flang and
397 /// Clang to interface with the variable-related registration functions.
400 enum OMPTargetDeviceClauseKind : uint32_t {
401 /// The target is marked for all devices
402 OMPTargetDeviceClauseAny = 0x0,
403 /// The target is marked for non-host devices
404 OMPTargetDeviceClauseNoHost = 0x1,
405 /// The target is marked for host devices
406 OMPTargetDeviceClauseHost = 0x2,
407 /// The target is marked as having no clause
408 OMPTargetDeviceClauseNone = 0x3
409 };
410
411 /// Device global variable entries info.
412 class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
413 /// Size of the global variable.
414 int64_t VarSize;
415 GlobalValue::LinkageTypes Linkage;
416 const std::string VarName;
417
418 public:
419 OffloadEntryInfoDeviceGlobalVar()
420 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
421 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
422 OMPTargetGlobalVarEntryKind Flags)
423 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
424 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
425 int64_t VarSize,
426 OMPTargetGlobalVarEntryKind Flags,
427 GlobalValue::LinkageTypes Linkage,
428 const std::string &VarName)
429 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
430 VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
431 setAddress(Addr);
432 }
433
434 int64_t getVarSize() const { return VarSize; }
435 StringRef getVarName() const { return VarName; }
436 void setVarSize(int64_t Size) { VarSize = Size; }
437 GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
438 void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
439 static bool classof(const OffloadEntryInfo *Info) {
440 return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
441 }
442 };
443
444 /// Initialize device global variable entry.
445 /// This is ONLY used for DEVICE compilation.
446 LLVM_ABI void initializeDeviceGlobalVarEntryInfo(
447 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order);
448
449 /// Register device global variable entry.
450 LLVM_ABI void registerDeviceGlobalVarEntryInfo(
451 StringRef VarName, Constant *Addr, int64_t VarSize,
452 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage);
453 /// Checks if the variable with the given name has been registered already.
454 bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
455 return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
456 }
457 /// Applies action \a Action on all registered entries.
458 typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
459 OffloadDeviceGlobalVarEntryInfoActTy;
460 LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(
461 const OffloadDeviceGlobalVarEntryInfoActTy &Action);
462
463private:
464 /// Return the count of entries at a particular source location.
465 unsigned
466 getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;
467
468 /// Update the count of entries at a particular source location.
469 void
470 incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);
471
472 static TargetRegionEntryInfo
473 getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
474 return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
475 EntryInfo.FileID, EntryInfo.Line, 0);
476 }
477
478 // Count of entries at a location.
479 std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;
480
481 // Storage for target region entries kind.
482 typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
483 OffloadEntriesTargetRegionTy;
484 OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
485 /// Storage for device global variable entries kind. The storage is to be
486 /// indexed by mangled name.
487 typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
488 OffloadEntriesDeviceGlobalVarTy;
489 OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
490};
491
492/// An interface to create LLVM-IR for OpenMP directives.
493///
494/// Each OpenMP directive has a corresponding public generator method.
495class OpenMPIRBuilder {
496public:
497 /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
498 /// not have an effect on \p M (see initialize)
499 OpenMPIRBuilder(Module &M)
500 : M(M), Builder(M.getContext()), OffloadInfoManager(this),
501 T(M.getTargetTriple()), IsFinalized(false) {}
502 LLVM_ABI ~OpenMPIRBuilder();
503
504 class AtomicInfo : public llvm::AtomicInfo {
505 llvm::Value *AtomicVar;
506
507 public:
508 AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits,
509 uint64_t ValueSizeInBits, llvm::Align AtomicAlign,
510 llvm::Align ValueAlign, bool UseLibcall,
511 IRBuilderBase::InsertPoint AllocaIP, llvm::Value *AtomicVar)
512 : llvm::AtomicInfo(Builder, Ty, AtomicSizeInBits, ValueSizeInBits,
513 AtomicAlign, ValueAlign, UseLibcall, AllocaIP),
514 AtomicVar(AtomicVar) {}
515
516 llvm::Value *getAtomicPointer() const override { return AtomicVar; }
517 void decorateWithTBAA(llvm::Instruction *I) override {}
518 llvm::AllocaInst *CreateAlloca(llvm::Type *Ty,
519 const llvm::Twine &Name) const override {
520 llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty);
521 allocaInst->setName(Name);
522 return allocaInst;
523 }
524 };
525 /// Initialize the internal state; this will put structure types and
526 /// potentially other helpers into the underlying module. Must be called
527 /// before any other method and only once! This internal state includes types
528 /// used in the OpenMPIRBuilder generated from OMPKinds.def.
529 LLVM_ABI void initialize();
530
531 void setConfig(OpenMPIRBuilderConfig C) { Config = C; }
532
533 /// Finalize the underlying module, e.g., by outlining regions.
534 /// \param Fn The function to be finalized. If not used,
535 /// all functions are finalized.
536 LLVM_ABI void finalize(Function *Fn = nullptr);
537
538 /// Check whether the finalize function has already run
539 /// \return true if the finalize function has already run
540 LLVM_ABI bool isFinalized();
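/// A typical lifecycle sketch (illustrative; `M` is an existing llvm::Module
/// and `Config` an OpenMPIRBuilderConfig set up beforehand):
/// \code
///   OpenMPIRBuilder OMPBuilder(M);
///   OMPBuilder.setConfig(Config);
///   OMPBuilder.initialize();
///   // ... emit directives via the create*/apply* methods ...
///   OMPBuilder.finalize();
/// \endcode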
541
542 /// Add attributes known for \p FnID to \p Fn.
543 LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
544
545 /// Type used throughout for insertion points.
546 using InsertPointTy = IRBuilder<>::InsertPoint;
547
548 /// Type used to represent an insertion point or an error value.
549 using InsertPointOrErrorTy = Expected<InsertPointTy>;
550
551 /// Create a name using the platform-specific separators.
552 /// \param Parts parts of the final name that need separation
553 /// The created name has a first separator between the first and second part
554 /// and a second separator between all other parts.
555 /// E.g. with FirstSeparator "$" and Separator "." and
556 /// parts: "p1", "p2", "p3", "p4"
557 /// The resulting name is "p1$p2.p3.p4"
558 /// The separators are retrieved from the OpenMPIRBuilderConfig.
559 LLVM_ABI std::string
560 createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
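/// For illustration, with the separators described above:
/// \code
///   // Yields "p1$p2.p3.p4" with FirstSeparator "$" and Separator ".".
///   std::string Name =
///       OMPBuilder.createPlatformSpecificName({"p1", "p2", "p3", "p4"});
/// \endcode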
561
562 /// Callback type for variable finalization (think destructors).
563 ///
564 /// \param CodeGenIP is the insertion point at which the finalization code
565 /// should be placed.
566 ///
567 /// A finalize callback knows about all objects that need finalization, e.g.
568 /// destruction, when the scope of the currently generated construct is left,
569 /// at the time and location the callback is invoked.
570 using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>;
571
572 struct FinalizationInfo {
573 /// The finalization callback provided by the last in-flight invocation of
574 /// createXXXX for the directive of kind DK.
575 FinalizeCallbackTy FiniCB;
576
577 /// The directive kind of the innermost directive that has an associated
578 /// region which might require finalization when it is left.
579 omp::Directive DK;
580
581 /// Flag to indicate if the directive is cancellable.
582 bool IsCancellable;
583 };
584
585 /// Push a finalization callback on the finalization stack.
586 ///
587 /// NOTE: Temporary solution until Clang CG is gone.
588 void pushFinalizationCB(const FinalizationInfo &FI) {
589 FinalizationStack.push_back(FI);
590 }
591
592 /// Pop the last finalization callback from the finalization stack.
593 ///
594 /// NOTE: Temporary solution until Clang CG is gone.
595 void popFinalizationCB() { FinalizationStack.pop_back(); }
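/// Sketch of guarding a cancellable region (illustrative; `FiniCB` is a
/// FinalizeCallbackTy provided by the caller):
/// \code
///   OMPBuilder.pushFinalizationCB(
///       {FiniCB, omp::OMPD_parallel, /*IsCancellable=*/true});
///   // ... emit the region ...
///   OMPBuilder.popFinalizationCB();
/// \endcode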
596
597 /// Callback type for body (=inner region) code generation
598 ///
599 /// The callback takes code locations as arguments, each describing a
600 /// location where additional instructions can be inserted.
601 ///
602 /// The CodeGenIP may be in the middle of a basic block or point to the end of
603 /// it. The basic block may have a terminator or be degenerate. The callback
604 /// function may just insert instructions at that position, but also split the
605 /// block (without the Before argument of BasicBlock::splitBasicBlock such
606 /// that the identity of the split predecessor block is preserved) and insert
607 /// additional control flow, including branches that do not lead back to what
608 /// follows the CodeGenIP. Note that since the callback is allowed to split
609 /// the block, callers must assume that InsertPoints to positions in the
610 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
611 /// such InsertPoints need to be preserved, the caller can split the block itself
612 /// before calling the callback.
613 ///
614 /// AllocaIP and CodeGenIP must not point to the same position.
615 ///
616 /// \param AllocaIP is the insertion point at which new alloca instructions
617 /// should be placed. The BasicBlock it is pointing to must
618 /// not be split.
619 /// \param CodeGenIP is the insertion point at which the body code should be
620 /// placed.
621 ///
622 /// \return an error, if any were triggered during execution.
623 using BodyGenCallbackTy =
624 function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
625
626 // This is created primarily for the sections construct, as llvm::function_ref
627 // (BodyGenCallbackTy) is not storable (as described in the comments of the
628 // function_ref class, function_ref contains a non-owning reference to the
629 // callable).
630 ///
631 /// \return an error, if any were triggered during execution.
632 using StorableBodyGenCallbackTy =
633 std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
634
635 /// Callback type for loop body code generation.
636 ///
637 /// \param CodeGenIP is the insertion point where the loop's body code must be
638 /// placed. This will be a dedicated BasicBlock with a
639 /// conditional branch from the loop condition check and
640 /// terminated with an unconditional branch to the loop
641 /// latch.
642 /// \param IndVar is the induction variable usable at the insertion point.
643 ///
644 /// \return an error, if any were triggered during execution.
645 using LoopBodyGenCallbackTy =
646 function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>;
647
648 /// Callback type for variable privatization (think copy & default
649 /// constructor).
650 ///
651 /// \param AllocaIP is the insertion point at which new alloca instructions
652 /// should be placed.
653 /// \param CodeGenIP is the insertion point at which the privatization code
654 /// should be placed.
655 /// \param Original The value being copied/created, should not be used in the
656 /// generated IR.
657 /// \param Inner The equivalent of \p Original that should be used in the
658 /// generated IR; this is equal to \p Original if the value is
659 /// a pointer and can thus be passed directly, otherwise it is
660 /// an equivalent but different value.
661 /// \param ReplVal The replacement value, thus a copy or newly created version
662 /// of \p Inner.
663 ///
664 /// \returns The new insertion point where code generation continues and
665 /// \p ReplVal the replacement value.
666 using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy(
667 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
668 Value &Inner, Value *&ReplVal)>;
669
670 /// Description of a LLVM-IR insertion point (IP) and a debug/source location
671 /// (filename, line, column, ...).
672 struct LocationDescription {
673 LocationDescription(const IRBuilderBase &IRB)
674 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
675 LocationDescription(const InsertPointTy &IP) : IP(IP) {}
676 LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
677 : IP(IP), DL(DL) {}
678 InsertPointTy IP;
679 DebugLoc DL;
680 };
681
682 /// Emitter methods for OpenMP directives.
683 ///
684 ///{
685
686 /// Generator for '#omp barrier'
687 ///
688 /// \param Loc The location where the barrier directive was encountered.
689 /// \param Kind The kind of directive that caused the barrier.
690 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
691 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
692 /// should be checked and acted upon.
693 /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
694 ///
695 /// \returns The insertion point after the barrier.
696 LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc,
697 omp::Directive Kind,
698 bool ForceSimpleCall = false,
699 bool CheckCancelFlag = true);
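/// Minimal sketch (illustrative; `Builder` and `OMPBuilder` are assumed to be
/// the caller's IRBuilder and OpenMPIRBuilder):
/// \code
///   OpenMPIRBuilder::LocationDescription Loc(Builder);
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
///       OMPBuilder.createBarrier(Loc, omp::OMPD_barrier);
///   if (!AfterIP)
///     return AfterIP.takeError();
///   Builder.restoreIP(*AfterIP);
/// \endcode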
700
701 /// Generator for '#omp cancel'
702 ///
703 /// \param Loc The location where the directive was encountered.
704 /// \param IfCondition The evaluated 'if' clause expression, if any.
705 /// \param CanceledDirective The kind of directive that is canceled.
706 ///
707 /// \returns The insertion point after the cancel.
708 LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc,
709 Value *IfCondition,
710 omp::Directive CanceledDirective);
711
712 /// Generator for '#omp cancellation point'
713 ///
714 /// \param Loc The location where the directive was encountered.
715 /// \param CanceledDirective The kind of directive that is canceled.
716 ///
717 /// \returns The insertion point after the cancellation point.
718 LLVM_ABI InsertPointOrErrorTy createCancellationPoint(
719 const LocationDescription &Loc, omp::Directive CanceledDirective);
720
721 /// Creates a ScanInfo object and returns a pointer to it.
722 LLVM_ABI Expected<ScanInfo *> scanInfoInitialize();
723
724 /// Generator for '#omp parallel'
725 ///
726 /// \param Loc The insert and source location description.
727 /// \param AllocaIP The insertion points to be used for alloca instructions.
728 /// \param BodyGenCB Callback that will generate the region code.
729 /// \param PrivCB Callback to copy a given variable (think copy constructor).
730 /// \param FiniCB Callback to finalize variable copies.
731 /// \param IfCondition The evaluated 'if' clause expression, if any.
732 /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
733 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
734 /// \param IsCancellable Flag to indicate a cancellable parallel region.
735 ///
736 /// \returns The insertion position *after* the parallel.
737 LLVM_ABI InsertPointOrErrorTy createParallel(
738 const LocationDescription &Loc, InsertPointTy AllocaIP,
739 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
740 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
741 omp::ProcBindKind ProcBind, bool IsCancellable);
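/// A condensed sketch (illustrative; error handling and privatization are
/// minimal, and `Builder`, `Loc` and `AllocaIP` are assumed to exist in the
/// caller):
/// \code
///   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
///   using InsertPointOrErrorTy = OpenMPIRBuilder::InsertPointOrErrorTy;
///   auto BodyGenCB = [&](InsertPointTy AllocaIP,
///                        InsertPointTy CodeGenIP) -> Error {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the parallel region body ...
///     return Error::success();
///   };
///   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
///                     Value &Orig, Value &Inner,
///                     Value *&ReplVal) -> InsertPointOrErrorTy {
///     ReplVal = &Inner; // no privatization in this sketch
///     return CodeGenIP;
///   };
///   auto FiniCB = [&](InsertPointTy IP) -> Error { return Error::success(); };
///   InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
///       Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
///       /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
///       /*IsCancellable=*/false);
/// \endcode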
742
743 /// Generator for the control flow structure of an OpenMP canonical loop.
744 ///
745 /// This generator operates on the logical iteration space of the loop, i.e.
746 /// the caller only has to provide a loop trip count of the loop as defined by
747 /// base language semantics. The trip count is interpreted as an unsigned
748 /// integer. The induction variable passed to \p BodyGenCB will be of the same
749 /// type and run from 0 to \p TripCount - 1. It is up to the callback to
750 /// convert the logical iteration variable to the loop counter variable in the
751 /// loop body.
752 ///
753 /// \param Loc The insert and source location description. The insert
754 /// location can be between two instructions or the end of a
755 /// degenerate block (e.g. a BB under construction).
756 /// \param BodyGenCB Callback that will generate the loop body code.
757 /// \param TripCount Number of iterations the loop body is executed.
758 /// \param Name Base name used to derive BB and instruction names.
759 ///
760 /// \returns An object representing the created control flow structure which
761 /// can be used for loop-associated directives.
762 LLVM_ABI Expected<CanonicalLoopInfo *>
763 createCanonicalLoop(const LocationDescription &Loc,
764 LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
765 const Twine &Name = "loop");
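/// A minimal sketch (illustrative; `TripCount` is a Value of integer type and
/// `Builder`, `Loc` and `OMPBuilder` are assumed to exist in the caller):
/// \code
///   auto LoopBodyCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
///                         Value *IV) -> Error {
///     Builder.restoreIP(CodeGenIP);
///     // ... use IV, which runs from 0 to TripCount - 1, to emit the body ...
///     return Error::success();
///   };
///   Expected<CanonicalLoopInfo *> LoopInfo =
///       OMPBuilder.createCanonicalLoop(Loc, LoopBodyCB, TripCount);
///   if (!LoopInfo)
///     return LoopInfo.takeError();
/// \endcode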
766
767 /// Generator for the control flow structure of an OpenMP canonical loop if
768 /// the parent directive has an `inscan` modifier specified.
769 /// If the `inscan` modifier is specified, the region of the parent is
770 /// expected to contain a `scan` directive. Based on the clauses in the
771 /// scan directive, the body of the loop is split into two loops: the input
772 /// loop and the scan loop. The input loop contains the code generated for
773 /// the input phase of the scan and the scan loop contains the code generated
774 /// for the scan phase. From the body-generation callback of these loops,
775 /// `createScan` is called when a scan directive is encountered in the loop
776 /// body; based on 1. whether an inclusive or exclusive scan is specified and
777 /// 2. whether the input loop or the scan loop is being generated, it lowers
778 /// the body of the for loop accordingly.
779 ///
780 /// \param Loc The insert and source location description.
781 /// \param BodyGenCB Callback that will generate the loop body code.
782 /// \param Start Value of the loop counter for the first iterations.
783 /// \param Stop Loop counter values past this will stop the loop.
784 /// \param Step Loop counter increment after each iteration; negative
785 /// means counting down.
786 /// \param IsSigned Whether Start, Stop and Step are signed integers.
787 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
788 /// counter.
789 /// \param ComputeIP Insertion point for instructions computing the trip
790 /// count. Can be used to ensure the trip count is available
791 /// at the outermost loop of a loop nest. If not set,
792 /// defaults to the preheader of the generated loop.
793 /// \param Name Base name used to derive BB and instruction names.
794 /// \param ScanRedInfo Pointer to the ScanInfo object created using
795 /// `scanInfoInitialize`.
796 ///
797 /// \returns A vector containing Loop Info of Input Loop and Scan Loop.
798 LLVM_ABI Expected<SmallVector<llvm::CanonicalLoopInfo *>>
799 createCanonicalScanLoops(const LocationDescription &Loc,
800 LoopBodyGenCallbackTy BodyGenCB, Value *Start,
801 Value *Stop, Value *Step, bool IsSigned,
802 bool InclusiveStop, InsertPointTy ComputeIP,
803 const Twine &Name, ScanInfo *ScanRedInfo);
804
805 /// Calculate the trip count of a canonical loop.
806 ///
807 /// This allows specifying user-defined loop counter values using increment,
808 /// upper- and lower bounds. To disambiguate the terminology when counting
809 /// downwards, instead of lower bounds we use \p Start for the loop counter
810 /// value in the first body iteration.
811 ///
812 /// Consider the following limitations:
813 ///
814 /// * A loop counter space over all integer values of its bit-width cannot be
815 /// represented. E.g. using uint8_t, a loop trip count of 256 cannot be
816 /// stored into an 8-bit integer:
817 ///
818 /// DO I = 0, 255, 1
819 ///
820 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g.
821 /// effectively counting downwards:
822 ///
823 /// for (uint8_t i = 100u; i > 0; i += 127u)
824 ///
825 ///
826 /// TODO: May need to add additional parameters to represent:
827 ///
828 /// * Allow representing downcounting with unsigned integers.
829 ///
830 /// * Sign of the step and the comparison operator might disagree:
831 ///
832 /// for (int i = 0; i < 42; i -= 1u)
833 ///
834 /// \param Loc The insert and source location description.
835 /// \param Start Value of the loop counter for the first iterations.
836 /// \param Stop Loop counter values past this will stop the loop.
837 /// \param Step Loop counter increment after each iteration; negative
838 /// means counting down.
839 /// \param IsSigned Whether Start, Stop and Step are signed integers.
840 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
841 /// counter.
842 /// \param Name Base name used to derive instruction names.
843 ///
844 /// \returns The value holding the calculated trip count.
845 LLVM_ABI Value *calculateCanonicalLoopTripCount(
846 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
847 bool IsSigned, bool InclusiveStop, const Twine &Name = "loop");
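/// Worked example (illustrative): Start=0, Stop=10, Step=3, InclusiveStop=false
/// executes the iterations {0, 3, 6, 9}, i.e. a trip count of
/// ceil((10 - 0) / 3) = 4.
/// \code
///   Value *TripCount = OMPBuilder.calculateCanonicalLoopTripCount(
///       Loc, Start, Stop, Step, /*IsSigned=*/true, /*InclusiveStop=*/false);
/// \endcode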
848
849 /// Generator for the control flow structure of an OpenMP canonical loop.
850 ///
851 /// Instead of a logical iteration space, this allows specifying user-defined
852 /// loop counter values using increment, upper- and lower bounds. To
853 /// disambiguate the terminology when counting downwards, instead of lower
854 /// bounds we use \p Start for the loop counter value in the first body iteration.
855 ///
856 /// It calls \see calculateCanonicalLoopTripCount for trip count calculations,
857 /// so limitations of that method apply here as well.
858 ///
859 /// \param Loc The insert and source location description.
860 /// \param BodyGenCB Callback that will generate the loop body code.
861 /// \param Start Value of the loop counter for the first iterations.
862 /// \param Stop Loop counter values past this will stop the loop.
863 /// \param Step Loop counter increment after each iteration; negative
864 /// means counting down.
865 /// \param IsSigned Whether Start, Stop and Step are signed integers.
866 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
867 /// counter.
868 /// \param ComputeIP Insertion point for instructions computing the trip
869 /// count. Can be used to ensure the trip count is available
870 /// at the outermost loop of a loop nest. If not set,
871 /// defaults to the preheader of the generated loop.
872 /// \param Name Base name used to derive BB and instruction names.
873 /// \param InScan Whether the loop has a scan reduction specified.
874 /// \param ScanRedInfo Pointer to the ScanInfo object created using
875 /// `scanInfoInitialize`.
876 ///
877 /// \returns An object representing the created control flow structure which
878 /// can be used for loop-associated directives.
879 LLVM_ABI Expected<CanonicalLoopInfo *> createCanonicalLoop(
880 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
881 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
882 InsertPointTy ComputeIP = {}, const Twine &Name = "loop",
883 bool InScan = false, ScanInfo *ScanRedInfo = nullptr);
884
885 /// Collapse a loop nest into a single loop.
886 ///
887 /// Merges loops of a loop nest into a single CanonicalLoopInfo representation
888 /// that has the same number of innermost loop iterations as the origin loop
889 /// nest. The induction variables of the input loops are derived from the
890 /// collapsed loop's induction variable. This is intended to be used to
891 /// implement OpenMP's collapse clause. Before applying a directive,
892 /// collapseLoops normalizes a loop nest to contain only a single loop and the
893 /// directive's implementation does not need to handle multiple loops itself.
894 /// This does not remove the need for directives to handle certain loop-nest
895 /// aspects themselves, such as the ordered(<n>) clause or the simd
896 /// schedule-clause modifier of the worksharing-loop directive.
897 ///
898 /// Example:
899 /// \code
900 /// for (int i = 0; i < 7; ++i) // Canonical loop "i"
901 /// for (int j = 0; j < 9; ++j) // Canonical loop "j"
902 /// body(i, j);
903 /// \endcode
904 ///
905 /// After collapsing with Loops={i,j}, the loop is changed to
906 /// \code
907 /// for (int ij = 0; ij < 63; ++ij) {
908 /// int i = ij / 9;
909 /// int j = ij % 9;
910 /// body(i, j);
911 /// }
912 /// \endcode
913 ///
914 /// In the current implementation, the following limitations apply:
915 ///
916 /// * All input loops have an induction variable of the same type.
917 ///
918 /// * The collapsed loop will have the same trip count integer type as the
919 /// input loops. Therefore it is possible that the collapsed loop cannot
920 /// represent all iterations of the input loops. For instance, assuming a
921 /// 32-bit integer type, and two input loops both iterating 2^16 times, the
922 /// theoretical trip count of the collapsed loop would be 2^32 iterations,
923 /// which cannot be represented in a 32-bit integer. Behavior is undefined
924 /// in this case.
925 ///
926 /// * The trip counts of every input loop must be available at \p ComputeIP.
927 /// Non-rectangular loops are not yet supported.
928 ///
929 /// * At each nest level, code between a surrounding loop and its nested loop
930 /// is hoisted into the loop body, and such code will be executed more
931 /// often than before collapsing (or not at all if any inner loop iteration
932 /// has a trip count of 0). This is permitted by the OpenMP specification.
933 ///
934 /// \param DL Debug location for instructions added for collapsing,
935 /// such as instructions to compute/derive the input loop's
936 /// induction variables.
937 /// \param Loops Loops in the loop nest to collapse. Loops are specified
938 /// from outermost-to-innermost and every control flow of a
939 /// loop's body must pass through its directly nested loop.
940 /// \param ComputeIP Where additional instructions that compute the collapsed
941 /// trip count are inserted. If not set, defaults to before
942 /// the generated loop.
943 ///
944 /// \returns The CanonicalLoopInfo object representing the collapsed loop.
945 LLVM_ABI CanonicalLoopInfo *collapseLoops(DebugLoc DL,
946 ArrayRef<CanonicalLoopInfo *> Loops,
947 InsertPointTy ComputeIP);
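/// Usage sketch (illustrative), collapsing a two-deep nest {Outer, Inner}:
/// \code
///   CanonicalLoopInfo *Collapsed =
///       OMPBuilder.collapseLoops(DL, {Outer, Inner}, /*ComputeIP=*/{});
///   // Collapsed can now be handed to a loop-associated directive, e.g.
///   // applyWorkshareLoop.
/// \endcode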
948
949 /// Get the default alignment value for a given target
950 ///
951 /// \param TargetTriple Target triple
952 /// \param Features StringMap which describes extra CPU features
953 LLVM_ABI static unsigned
954 getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
955 const StringMap<bool> &Features);
956
957 /// Retrieve (or create if non-existent) the address of a declare
958 /// target variable, used in conjunction with registerTargetGlobalVariable
959 /// to create declare target global variables.
960 ///
961 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
962 /// clause used in conjunction with the variable being registered (link,
963 /// to, enter).
964 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
965 /// clause used in conjunction with the variable being registered (nohost,
966 /// host, any)
967 /// \param IsDeclaration - boolean stating if the variable being registered
968 /// is declaration-only and not a definition
969 /// \param IsExternallyVisible - boolean stating if the variable is externally
970 /// visible
971 /// \param EntryInfo - Unique entry information for the value generated
972 /// using getTargetEntryUniqueInfo, used to name generated pointer references
973 /// to the declare target variable
974 /// \param MangledName - the mangled name of the variable being registered
975 /// \param GeneratedRefs - references generated by invocations of
976 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar;
977 /// these are required by Clang for bookkeeping.
978 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
979 /// \param TargetTriple - The OpenMP device target triple we are compiling
980 /// for
981 /// \param LlvmPtrTy - The type of the variable we are generating or
982 /// retrieving an address for
983 /// \param GlobalInitializer - a lambda function which creates a constant
984 /// used for initializing a pointer reference to the variable in certain
985 /// cases. If a nullptr is passed, it will default to utilising the original
986 /// variable to initialize the pointer reference.
987 /// \param VariableLinkage - a lambda function which returns the variable's
988 /// linkage type; if unspecified and a nullptr is given, it will instead
989 /// utilise the linkage stored on the existing global variable in the
990 /// LLVM module.
991 LLVM_ABI Constant *getAddrOfDeclareTargetVar(
992 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
993 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
994 bool IsDeclaration, bool IsExternallyVisible,
995 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
996 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
997 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
998 std::function<Constant *()> GlobalInitializer,
999 std::function<GlobalValue::LinkageTypes()> VariableLinkage);
1000
1001 /// Registers a target variable for device or host.
1002 ///
1003 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
1004 /// clause used in conjunction with the variable being registered (link,
1005 /// to, enter).
1006 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
1007 /// clause used in conjunction with the variable being registered (nohost,
1008 /// host, any)
1009 /// \param IsDeclaration - boolean stating if the variable being registered
1010 /// is declaration-only and not a definition
1011 /// \param IsExternallyVisible - boolean stating if the variable is externally
1012 /// visible
1013 /// \param EntryInfo - Unique entry information for the value generated
1014 /// using getTargetEntryUniqueInfo, used to name generated pointer references
1015 /// to the declare target variable
1016 /// \param MangledName - the mangled name of the variable being registered
1017 /// \param GeneratedRefs - references generated by invocations of
1018 /// registerTargetGlobalVariable; these are required by Clang for
1019 /// bookkeeping.
1020 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
1021 /// \param TargetTriple - The OpenMP device target triple we are compiling
1022 /// for
1023 /// \param GlobalInitializer - a lambda function which creates a constant
1024 /// used for initializing a pointer reference to the variable in certain
1025 /// cases. If a nullptr is passed, it will default to utilising the original
1026 /// variable to initialize the pointer reference.
1027 /// \param VariableLinkage - a lambda function which returns the variable's
1028 /// linkage type; if unspecified and a nullptr is given, it will instead
1029 /// utilise the linkage stored on the existing global variable in the
1030 /// LLVM module.
1031 /// \param LlvmPtrTy - The type of the variable we are generating or
1032 /// retrieving an address for
1033 /// \param Addr - the original llvm value (addr) of the variable to be
1034 /// registered
1035 LLVM_ABI void registerTargetGlobalVariable(
1036 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
1037 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
1038 bool IsDeclaration, bool IsExternallyVisible,
1039 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
1040 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
1041 std::vector<Triple> TargetTriple,
1042 std::function<Constant *()> GlobalInitializer,
1043 std::function<GlobalValue::LinkageTypes()> VariableLinkage,
1044 Type *LlvmPtrTy, Constant *Addr);
1045
1046 /// Get the offset of the OMP_MAP_MEMBER_OF field.
1047 LLVM_ABI unsigned getFlagMemberOffset();
1048
1049 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
1050 /// the position given.
1051 /// \param Position - A value indicating the position of the parent
1052 /// of the member in the kernel argument structure, often retrieved
1053 /// by the parent's position in the combined information vectors used
1054 /// to generate the structure itself. Multiple children (members of)
1055 /// with the same parent will use the same returned member flag.
1056 LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);
1057
1058 /// Given an initial flag set, this function modifies it to contain
1059 /// the passed in MemberOfFlag generated from the getMemberOfFlag
1060 /// function. The results are dependent on the existing flag bits
1061 /// set in the original flag set.
1062 /// \param Flags - The original set of flags to be modified with the
1063 /// passed in MemberOfFlag.
1064 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
1065 /// slightly by getMemberOfFlag, which adjusts the flag bits
1066 /// based on the member's position in its parent.
1067 LLVM_ABI void
1068 setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
1069 omp::OpenMPOffloadMappingFlags MemberOfFlag);
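/// Sketch (illustrative; `ParentIdx` is the parent's position in the combined
/// information vectors and `Flags` an existing mapping-flag set):
/// \code
///   omp::OpenMPOffloadMappingFlags MemberOf =
///       OMPBuilder.getMemberOfFlag(ParentIdx);
///   OMPBuilder.setCorrectMemberOfFlag(Flags, MemberOf);
/// \endcode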
1070
1071private:
1072 /// Modifies the canonical loop to be a statically-scheduled workshare loop
1073 /// which is executed on the device
1074 ///
1075 /// This takes a \p CLI representing a canonical loop, such as the one
1076 /// created by \see createCanonicalLoop and emits additional instructions to
1077 /// turn it into a workshare loop. In particular, it inserts a call in the
1078 /// preheader to an OpenMP device runtime function which handles worksharing
1079 /// of the loop body iterations.
1080 ///
1081 /// \param DL Debug location for instructions added for the
1082 /// workshare-loop construct itself.
1083 /// \param CLI A descriptor of the canonical loop to workshare.
1084 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1085 /// preheader of the loop.
1086 /// \param LoopType Information about type of loop worksharing.
1087 /// It corresponds to type of loop workshare OpenMP pragma.
1088 ///
1089 /// \returns Point where to insert code after the workshare construct.
1090 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
1091 InsertPointTy AllocaIP,
1092 omp::WorksharingLoopType LoopType);
1093
1094 /// Modifies the canonical loop to be a statically-scheduled workshare loop.
1095 ///
1096 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1097 /// created by \p createCanonicalLoop and emits additional instructions to
1098 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1099 /// runtime function in the preheader to obtain the loop bounds to be used in
1100 /// the current thread, updates the relevant instructions in the canonical
1101 /// loop and calls to an OpenMP runtime finalization function after the loop.
1102 ///
1103 /// \param DL Debug location for instructions added for the
1104 /// workshare-loop construct itself.
1105 /// \param CLI A descriptor of the canonical loop to workshare.
1106 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1107 /// preheader of the loop.
1108 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1109 /// the loop.
1110 /// \param LoopType Type of workshare loop.
1111 ///
1112 /// \returns Point where to insert code after the workshare construct.
1113 InsertPointOrErrorTy applyStaticWorkshareLoop(
1114 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1115 omp::WorksharingLoopType LoopType, bool NeedsBarrier);
1116
1117 /// Modifies the canonical loop to be a statically-scheduled workshare loop with a
1118 /// user-specified chunk size.
1119 ///
1120 /// \param DL Debug location for instructions added for the
1121 /// workshare-loop construct itself.
1122 /// \param CLI A descriptor of the canonical loop to workshare.
1123 /// \param AllocaIP An insertion point for Alloca instructions usable in
1124 /// the preheader of the loop.
1125 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
1126 /// loop.
1127 /// \param ChunkSize The user-specified chunk size.
1128 ///
1129 /// \returns Point where to insert code after the workshare construct.
1130 InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
1131 CanonicalLoopInfo *CLI,
1132 InsertPointTy AllocaIP,
1133 bool NeedsBarrier,
1134 Value *ChunkSize);
1135
1136 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
1137 ///
1138 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1139 /// created by \p createCanonicalLoop and emits additional instructions to
1140 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1141 /// runtime function in the preheader to obtain, and then in each iteration
1142 /// to update the loop counter.
1143 ///
1144 /// \param DL Debug location for instructions added for the
1145 /// workshare-loop construct itself.
1146 /// \param CLI A descriptor of the canonical loop to workshare.
1147 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1148 /// preheader of the loop.
1149 /// \param SchedType Type of scheduling to be passed to the init function.
1150 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1151 /// the loop.
1152 /// \param Chunk The size of loop chunk considered as a unit when
1153 /// scheduling. If \p nullptr, defaults to 1.
1154 ///
1155 /// \returns Point where to insert code after the workshare construct.
1156 InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL,
1157 CanonicalLoopInfo *CLI,
1158 InsertPointTy AllocaIP,
1159 omp::OMPScheduleType SchedType,
1160 bool NeedsBarrier,
1161 Value *Chunk = nullptr);
1162
1163 /// Create an alternative version of the loop to support the if clause.
1164 ///
1165 /// The OpenMP if clause can require generating a second loop. This loop
1166 /// is executed when the if clause condition is not met. createIfVersion
1167 /// adds a branch instruction to the copied loop if \p IfCond is not met.
1168 ///
1169 /// \param Loop Original loop which should be versioned.
1170 /// \param IfCond Value which corresponds to if clause condition
1171 /// \param VMap Value to value map to define relation between
1172 /// original and copied loop values and loop blocks.
1173 /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1174 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1175 ValueMap<const Value *, WeakTrackingVH> &VMap,
1176 LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L,
1177 const Twine &NamePrefix = "");
1178
1179public:
1180 /// Modifies the canonical loop to be a workshare loop.
1181 ///
1182 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1183 /// created by \p createCanonicalLoop and emits additional instructions to
1184 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1185 /// runtime function in the preheader to obtain the loop bounds to be used in
1186 /// the current thread, updates the relevant instructions in the canonical
1187 /// loop and calls to an OpenMP runtime finalization function after the loop.
1188 ///
1189 /// The concrete transformation is done by applyStaticWorkshareLoop,
1190 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1191 /// on the value of \p SchedKind and \p ChunkSize.
1192 ///
1193 /// \param DL Debug location for instructions added for the
1194 /// workshare-loop construct itself.
1195 /// \param CLI A descriptor of the canonical loop to workshare.
1196 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1197 /// preheader of the loop.
1198 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1199 /// the loop.
1200 /// \param SchedKind Scheduling algorithm to use.
1201 /// \param ChunkSize The chunk size for the inner loop.
1202 /// \param HasSimdModifier Whether the simd modifier is present in the
1203 /// schedule clause.
1204 /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1205 /// the schedule clause.
1206 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1207 /// present in the schedule clause.
1208 /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1209 /// present.
1210 /// \param LoopType Information about type of loop worksharing.
1211 /// It corresponds to type of loop workshare OpenMP pragma.
1212 ///
1213 /// \returns Point where to insert code after the workshare construct.
1214 LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
1215 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1216 bool NeedsBarrier,
1217 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1218 Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1219 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1220 bool HasOrderedClause = false,
1221 omp::WorksharingLoopType LoopType =
1222 omp::WorksharingLoopType::ForStaticLoop);
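/// Sketch (illustrative): lowering a canonical loop `CLI` to a statically
/// scheduled workshare loop followed by a barrier:
/// \code
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
///       OMPBuilder.applyWorkshareLoop(DL, CLI, AllocaIP,
///                                     /*NeedsBarrier=*/true,
///                                     omp::OMP_SCHEDULE_Static);
///   if (!AfterIP)
///     return AfterIP.takeError();
/// \endcode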
1223
1224 /// Tile a loop nest.
1225 ///
1226 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
1227 /// \p Loops must be perfectly nested, from outermost to innermost loop
1228 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1229 /// of every loop and every tile sizes must be usable in the outermost
1230 /// loop's preheader. This implies that the loop nest is rectangular.
1231 ///
1232 /// Example:
1233 /// \code
1234 /// for (int i = 0; i < 15; ++i) // Canonical loop "i"
1235 /// for (int j = 0; j < 14; ++j) // Canonical loop "j"
1236 /// body(i, j);
1237 /// \endcode
1238 ///
1239 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1240 /// \code
1241 /// for (int i1 = 0; i1 < 3; ++i1)
1242 /// for (int j1 = 0; j1 < 2; ++j1)
1243 /// for (int i2 = 0; i2 < 5; ++i2)
1244 /// for (int j2 = 0; j2 < 7; ++j2)
1245 /// body(i1*5+i2, j1*7+j2);
1246 /// \endcode
1247 ///
1248 /// The returned vector contains the loops {i1,j1,i2,j2}. The loops i1 and j1 are
1249 /// referred to as the floor loops, and the loops i2 and j2 as the tiles. Tiling also
1250 /// handles non-constant trip counts, non-constant tile sizes and trip counts
1251 /// that are not multiples of the tile size. In the latter case the tile loop
1252 /// of the last floor-loop iteration will have fewer iterations than specified
1253 /// as its tile size.
1254 ///
1255 ///
1256 /// @param DL Debug location for instructions added by tiling, for
1257 /// instance the floor- and tile trip count computation.
1258 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are
1259 /// invalidated by this method, i.e., they should not be used after
1260 /// tiling.
1261 /// @param TileSizes For each loop in \p Loops, the tile size for that
1262 /// dimension.
1263 ///
1264 /// \returns A list of generated loops. Contains twice as many loops as the
1265 /// input loop nest; the first half are the floor loops and the
1266 /// second half are the tile loops.
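/// A minimal usage sketch for the example above (hypothetical surrounding
/// code; `LoopI` and `LoopJ` are the CanonicalLoopInfo objects of the "i" and
/// "j" loops):
/// \code
///   Value *TileI = Builder.getInt32(5);
///   Value *TileJ = Builder.getInt32(7);
///   std::vector<CanonicalLoopInfo *> Tiled =
///       OMPBuilder.tileLoops(DL, {LoopI, LoopJ}, {TileI, TileJ});
///   // Tiled[0]/Tiled[1] are the floor loops, Tiled[2]/Tiled[3] the tiles.
/// \endcode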
1267 LLVM_ABI std::vector<CanonicalLoopInfo *>
1268 tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
1269 ArrayRef<Value *> TileSizes);
1270
1271 /// Fully unroll a loop.
1272 ///
1273 /// Instead of unrolling the loop immediately (and duplicating its body
1274 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1275 /// metadata.
1276 ///
1277 /// \param DL Debug location for instructions added by unrolling.
1278 /// \param Loop The loop to unroll. The loop will be invalidated.
1279 LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
1280
1281 /// Fully or partially unroll a loop. How the loop is unrolled is determined
1282 /// using LLVM's LoopUnrollPass.
1283 ///
1284 /// \param DL Debug location for instructions added by unrolling.
1285 /// \param Loop The loop to unroll. The loop will be invalidated.
1286 LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
1287
1288 /// Partially unroll a loop.
1289 ///
1290 /// The CanonicalLoopInfo of the unrolled loop for use with chained
1291 /// loop-associated directive can be requested using \p UnrolledCLI. Not
1292 /// needing the CanonicalLoopInfo allows more efficient code generation by
1293 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1294 /// A loop-associated directive applied to the unrolled loop needs to know the
1295 /// new trip count which means that if using a heuristically determined unroll
1296 /// factor (\p Factor == 0), that factor must be computed immediately. We use
1297 /// the same logic as the LoopUnrollPass to derive the unroll factor, but that
1298 /// logic assumes that some canonicalization has taken place (e.g.
1299 /// Mem2Reg, LICM, GVN, inlining, etc.). As a result, the heuristic will perform
1300 /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1301 ///
1302 /// \param DL Debug location for instructions added by unrolling.
1303 /// \param Loop The loop to unroll. The loop will be invalidated.
1304 /// \param Factor The factor to unroll the loop by. A factor of 0
1305 /// indicates that a heuristic should be used to determine
1306 /// the unroll-factor.
1307 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1308 /// partially unrolled loop. Otherwise, uses loop metadata
1309 /// to defer unrolling to the LoopUnrollPass.
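/// A minimal usage sketch (hypothetical surrounding code; unrolls `CLI` by a
/// factor of 4 and requests the CanonicalLoopInfo of the resulting loop so a
/// chained loop-associated construct can be applied to it):
/// \code
///   CanonicalLoopInfo *UnrolledCLI = nullptr;
///   OMPBuilder.unrollLoopPartial(DL, CLI, /*Factor=*/4, &UnrolledCLI);
/// \endcode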
1310 LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
1311 int32_t Factor,
1312 CanonicalLoopInfo **UnrolledCLI);
1313
1314 /// Add metadata to simd-ize a loop. If \p IfCond is not nullptr, the loop
1315 /// is cloned. Metadata that prevents vectorization is added to the cloned
1316 /// loop, and the cloned loop is executed when \p IfCond evaluates to
1317 /// false.
1318 ///
1319 /// \param Loop The loop to simd-ize.
1320 /// \param AlignedVars The map which contains pairs of the pointer
1321 /// and its corresponding alignment.
1322 /// \param IfCond The value which corresponds to the if clause
1323 /// condition.
1324 /// \param Order The enum to map order clause.
1325 /// \param Simdlen The Simdlen length to apply to the simd loop.
1326 /// \param Safelen The Safelen length to apply to the simd loop.
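/// A minimal usage sketch (hypothetical surrounding code; no aligned
/// variables, no if clause, a simdlen of 8, and no safelen or order clause):
/// \code
///   MapVector<Value *, Value *> AlignedVars;
///   OMPBuilder.applySimd(CLI, AlignedVars, /*IfCond=*/nullptr,
///                        omp::OrderKind::OMP_ORDER_unknown,
///                        /*Simdlen=*/Builder.getInt32(8),
///                        /*Safelen=*/nullptr);
/// \endcode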
1327 LLVM_ABI void applySimd(CanonicalLoopInfo *Loop,
1328 MapVector<Value *, Value *> AlignedVars,
1329 Value *IfCond, omp::OrderKind Order,
1330 ConstantInt *Simdlen, ConstantInt *Safelen);
1331
1332 /// Generator for '#omp flush'
1333 ///
1334 /// \param Loc The location where the flush directive was encountered
1335 LLVM_ABI void createFlush(const LocationDescription &Loc);
1336
1337 /// Generator for '#omp taskwait'
1338 ///
1339 /// \param Loc The location where the taskwait directive was encountered.
1340 LLVM_ABI void createTaskwait(const LocationDescription &Loc);
1341
1342 /// Generator for '#omp taskyield'
1343 ///
1344 /// \param Loc The location where the taskyield directive was encountered.
1345 LLVM_ABI void createTaskyield(const LocationDescription &Loc);
1346
1347 /// A struct to pack the relevant information for an OpenMP depend clause.
1348 struct DependData {
1349 omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
1350 Type *DepValueType;
1351 Value *DepVal;
1352 explicit DependData() = default;
1353 DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
1354 Value *DepVal)
1355 : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
1356 };
1357
1358 /// Generator for `#omp task`
1359 ///
1360 /// \param Loc The location where the task construct was encountered.
1361 /// \param AllocaIP The insertion point to be used for alloca instructions.
1362 /// \param BodyGenCB Callback that will generate the region code.
1363 /// \param Tied True if the task is tied, false if the task is untied.
1364 /// \param Final i1 value which is `true` if the task is final, `false` if the
1365 /// task is not final.
1366 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1367 /// task is generated, and the encountering thread must
1368 /// suspend the current task region, for which execution
1369 /// cannot be resumed until execution of the structured
1370 /// block that is associated with the generated task is
1371 /// completed.
1372 /// \param EventHandle If present, signifies the event handle as part of
1373 /// the detach clause
1374 /// \param Mergeable True if the given task is `mergeable`.
1375 /// \param Priority The `priority-value`, which specifies the execution order
1376 /// of the tasks generated by the construct.
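/// A minimal usage sketch (hypothetical surrounding code; creates a tied task
/// with a single `depend(in: ...)` dependence on `DepVal`):
/// \code
///   auto BodyGenCB = [&](InsertPointTy AllocaIP,
///                        InsertPointTy CodeGenIP) -> Error {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the task body here ...
///     return Error::success();
///   };
///   SmallVector<DependData> Deps;
///   Deps.emplace_back(omp::RTLDependenceKindTy::DepIn, Builder.getInt32Ty(),
///                     DepVal);
///   InsertPointOrErrorTy AfterIP =
///       OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB, /*Tied=*/true,
///                             /*Final=*/nullptr, /*IfCondition=*/nullptr,
///                             Deps);
/// \endcode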
1377 LLVM_ABI InsertPointOrErrorTy
1378 createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
1379 BodyGenCallbackTy BodyGenCB, bool Tied = true,
1380 Value *Final = nullptr, Value *IfCondition = nullptr,
1381 SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
1382 Value *EventHandle = nullptr, Value *Priority = nullptr);
1383
1384 /// Generator for the taskgroup construct
1385 ///
1386 /// \param Loc The location where the taskgroup construct was encountered.
1387 /// \param AllocaIP The insertion point to be used for alloca instructions.
1388 /// \param BodyGenCB Callback that will generate the region code.
1389 LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc,
1390 InsertPointTy AllocaIP,
1391 BodyGenCallbackTy BodyGenCB);
1392
1393 using FileIdentifierInfoCallbackTy =
1394 std::function<std::tuple<std::string, uint64_t>()>;
1395
1396 /// Creates a unique info for a target entry when provided a filename and
1397 /// line number from a callback.
1398 ///
1399 /// \param CallBack A callback function which should return the filename the
1400 /// entry resides in as well as the line number for the target entry.
1401 /// \param ParentName The name of the parent the target entry resides in, if
1402 /// any.
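/// A minimal sketch of such a callback (hypothetical file name and line
/// number):
/// \code
///   auto FileLineCB = []() {
///     return std::make_tuple(std::string("input.c"), uint64_t(42));
///   };
///   TargetRegionEntryInfo EntryInfo =
///       OpenMPIRBuilder::getTargetEntryUniqueInfo(FileLineCB, "foo");
/// \endcode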
1403 LLVM_ABI static TargetRegionEntryInfo
1404 getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
1405 StringRef ParentName = "");
1406
1407 /// Enum class for the ReductionGen CallBack type to be used.
1408 enum class ReductionGenCBKind { Clang, MLIR };
1409
1410 /// ReductionGen CallBack for Clang
1411 ///
1412 /// \param CodeGenIP InsertPoint for CodeGen.
1413 /// \param Index Index of the ReductionInfo to generate code for.
1414 /// \param LHS Optionally used by Clang to return the LHS pointer it used for
1415 /// codegen, used for fixup later.
1416 /// \param RHS Optionally used by Clang to
1417 /// return the RHS pointer it used for codegen, used for fixup later.
1418 /// \param CurFn Optionally used by Clang to pass in the Current Function as
1419 /// Clang context may be old.
1420 using ReductionGenClangCBTy =
1421 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1422 Value **LHS, Value **RHS, Function *CurFn)>;
1423
1424 /// ReductionGen CallBack for MLIR
1425 ///
1426 /// \param CodeGenIP InsertPoint for CodeGen.
1427 /// \param LHS Pass in the LHS Value to be used for CodeGen.
1428 /// \param RHS Pass in the RHS Value to be used for CodeGen.
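/// For example, a scalar integer sum reduction could be expressed with a
/// callback like the following sketch (hypothetical surrounding code):
/// \code
///   ReductionGenCBTy SumGen = [&](InsertPointTy CodeGenIP, Value *LHS,
///                                 Value *RHS,
///                                 Value *&Res) -> InsertPointOrErrorTy {
///     Builder.restoreIP(CodeGenIP);
///     Res = Builder.CreateAdd(LHS, RHS);
///     return Builder.saveIP();
///   };
/// \endcode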
1429 using ReductionGenCBTy = std::function<InsertPointOrErrorTy(
1430 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1431
1432 /// Functions used to generate atomic reductions. Such functions take two
1433 /// Values representing pointers to LHS and RHS of the reduction, as well as
1434 /// the element type of these pointers. They are expected to atomically
1435 /// update the LHS to the reduced value.
1436 using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy(
1437 InsertPointTy, Type *, Value *, Value *)>;
1438
1439 /// Enum class for reduction evaluation types scalar, complex and aggregate.
1440 enum class EvalKind { Scalar, Complex, Aggregate };
1441
1442 /// Information about an OpenMP reduction.
1443 struct ReductionInfo {
1444 ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
1445 EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
1446 ReductionGenClangCBTy ReductionGenClang,
1447 ReductionGenAtomicCBTy AtomicReductionGen)
1448 : ElementType(ElementType), Variable(Variable),
1449 PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
1450 ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
1451 AtomicReductionGen(AtomicReductionGen) {}
1452 ReductionInfo(Value *PrivateVariable)
1453 : ElementType(nullptr), Variable(nullptr),
1454 PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
1455 ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}
1456
1457 /// Reduction element type, must match pointee type of variable.
1458 Type *ElementType;
1459
1460 /// Reduction variable of pointer type.
1461 Value *Variable;
1462
1463 /// Thread-private partial reduction variable.
1464 Value *PrivateVariable;
1465
1466 /// Reduction evaluation kind - scalar, complex or aggregate.
1467 EvalKind EvaluationKind;
1468
1469 /// Callback for generating the reduction body. The IR produced by this will
1470 /// be used to combine two values in a thread-safe context, e.g., under
1471 /// lock or within the same thread, and therefore need not be atomic.
1472 ReductionGenCBTy ReductionGen;
1473
1474 /// Clang callback for generating the reduction body. The IR produced by
1475 /// this will be used to combine two values in a thread-safe context, e.g.,
1476 /// under lock or within the same thread, and therefore need not be atomic.
1477 ReductionGenClangCBTy ReductionGenClang;
1478
1479 /// Callback for generating the atomic reduction body, may be null. The IR
1480 /// produced by this will be used to atomically combine two values during
1481 /// reduction. If null, the implementation will use the non-atomic version
1482 /// along with the appropriate synchronization mechanisms.
1483 ReductionGenAtomicCBTy AtomicReductionGen;
1484 };
1485
1486 enum class CopyAction : unsigned {
1487 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1488 // the warp using shuffle instructions.
1489 RemoteLaneToThread,
1490 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1491 ThreadCopy,
1492 };
1493
1494 struct CopyOptionsTy {
1495 Value *RemoteLaneOffset = nullptr;
1496 Value *ScratchpadIndex = nullptr;
1497 Value *ScratchpadWidth = nullptr;
1498 };
1499
1500 /// Supporting functions for Reductions CodeGen.
1501private:
1502 /// Get the id of the current thread on the GPU.
1503 Value *getGPUThreadID();
1504
1505 /// Get the GPU warp size.
1506 Value *getGPUWarpSize();
1507
1508 /// Get the id of the warp in the block.
1509 /// We assume that the warp size is 32, which is always the case
1510 /// on the NVPTX device, to generate more efficient code.
1511 Value *getNVPTXWarpID();
1512
1513 /// Get the id of the current lane in the Warp.
1514 /// We assume that the warp size is 32, which is always the case
1515 /// on the NVPTX device, to generate more efficient code.
1516 Value *getNVPTXLaneID();
1517
1518 /// Cast value to the specified type.
1519 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1520
1521 /// This function creates calls to one of two shuffle functions to copy
1522 /// variables between lanes in a warp.
1523 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1524 Type *ElementType, Value *Offset);
1525
1526 /// Function to shuffle over the value from the remote lane.
1527 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1528 Type *ElementType, Value *Offset,
1529 Type *ReductionArrayTy);
1530
1531 /// Emit instructions to copy a Reduce list, which contains partially
1532 /// aggregated values, in the specified direction.
1533 void emitReductionListCopy(
1534 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1535 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1536 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1537
1538 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1539 /// in the same warp. It uses shuffle instructions to copy over data from
1540 /// a remote lane's stack. The reduction algorithm performed is specified
1541 /// by the fourth parameter.
1542 ///
1543 /// Algorithm Versions.
1544 /// Full Warp Reduce (argument value 0):
1545 /// This algorithm assumes that all 32 lanes are active and gathers
1546 /// data from these 32 lanes, producing a single resultant value.
1547 /// Contiguous Partial Warp Reduce (argument value 1):
1548 /// This algorithm assumes that only a *contiguous* subset of lanes
1549 /// are active. This happens for the last warp in a parallel region
1550 /// when the user-specified num_threads is not an integer multiple of
1551 /// 32. This contiguous subset always starts with the zeroth lane.
1552 /// Partial Warp Reduce (argument value 2):
1553 /// This algorithm gathers data from any number of lanes at any position.
1554 /// All reduced values are stored in the lowest possible lane. The set
1555 /// of problems every algorithm addresses is a super set of those
1556 /// addressable by algorithms with a lower version number. Overhead
1557 /// increases as algorithm version increases.
1558 ///
1559 /// Terminology
1560 /// Reduce element:
1561 /// Reduce element refers to the individual data field with primitive
1562 /// data types to be combined and reduced across threads.
1563 /// Reduce list:
1564 /// Reduce list refers to a collection of local, thread-private
1565 /// reduce elements.
1566 /// Remote Reduce list:
1567 /// Remote Reduce list refers to a collection of remote (relative to
1568 /// the current thread) reduce elements.
1569 ///
1570 /// We distinguish between three states of threads that are important to
1571 /// the implementation of this function.
1572 /// Alive threads:
1573 /// Threads in a warp executing the SIMT instruction, as distinguished from
1574 /// threads that are inactive due to divergent control flow.
1575 /// Active threads:
1576 /// The minimal set of threads that has to be alive upon entry to this
1577 /// function. The computation is correct iff active threads are alive.
1578 /// Some threads are alive but they are not active because they do not
1579 /// contribute to the computation in any useful manner. Turning them off
1580 /// may introduce control flow overheads without any tangible benefits.
1581 /// Effective threads:
1582 /// In order to comply with the argument requirements of the shuffle
1583 /// function, we must keep all lanes holding data alive. But at most
1584 /// half of them perform value aggregation; we refer to this half of
1585 /// threads as effective. The other half is simply handing off their
1586 /// data.
1587 ///
1588 /// Procedure
1589 /// Value shuffle:
1590 /// In this step active threads transfer data from higher lane positions
1591 /// in the warp to lower lane positions, creating Remote Reduce list.
1592 /// Value aggregation:
1593 /// In this step, effective threads combine their thread local Reduce list
1594 /// with Remote Reduce list and store the result in the thread local
1595 /// Reduce list.
1596 /// Value copy:
1597 /// In this step, we deal with the assumption made by algorithm 2
1598 /// (i.e. contiguity assumption). When we have an odd number of lanes
1599 /// active, say 2k+1, only k threads will be effective and therefore k
1600 /// new values will be produced. However, the Reduce list owned by the
1601 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1602 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1603 /// that the contiguity assumption still holds.
1604 ///
1605 /// \param ReductionInfos Array type containing the ReductionOps.
1606 /// \param ReduceFn The reduction function.
1607 /// \param FuncAttrs Optional param to specify any function attributes that
1608 /// need to be copied to the new function.
1609 ///
1610 /// \return The ShuffleAndReduce function.
1611 Function *emitShuffleAndReduceFunction(
1612 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1613 Function *ReduceFn, AttributeList FuncAttrs);
1614
1615 /// Helper function for CreateCanonicalScanLoops to create the input loop
1616 /// using the first generator and the scan loop using the second generator.
1617 /// \param InputLoopGen Callback for generating the loop for the input phase.
1618 /// \param ScanLoopGen Callback for generating the loop for the scan phase.
1619 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1620 /// `ScanInfoInitialize`.
1621 ///
1622 /// \return error if any produced, else return success.
1623 Error emitScanBasedDirectiveIR(
1624 llvm::function_ref<Error()> InputLoopGen,
1625 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
1626 ScanInfo *ScanRedInfo);
1627
1628 /// Creates the basic blocks required for scan reduction.
1629 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1630 /// `ScanInfoInitialize`.
1631 void createScanBBs(ScanInfo *ScanRedInfo);
1632
1633 /// Dynamically allocates the buffer needed for scan reduction.
1634 /// \param AllocaIP The IP where possibly-shared pointer of buffer needs to
1635 /// be declared.
1636 /// \param ScanVars Scan Variables.
1637 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1638 /// `ScanInfoInitialize`.
1639 ///
1640 /// \return error if any produced, else return success.
1641 Error emitScanBasedDirectiveDeclsIR(InsertPointTy AllocaIP,
1642 ArrayRef<llvm::Value *> ScanVars,
1643 ArrayRef<llvm::Type *> ScanVarsType,
1644 ScanInfo *ScanRedInfo);
1645
1646 /// Copies the result back to the reduction variable.
1647 /// \param ReductionInfos Array type containing the ReductionOps.
1648 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1649 /// `ScanInfoInitialize`.
1650 ///
1651 /// \return error if any produced, else return success.
1652 Error emitScanBasedDirectiveFinalsIR(
1653 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1654 ScanInfo *ScanInfo);
1655
1656 /// This function emits a helper that gathers Reduce lists from the first
1657 /// lane of every active warp to lanes in the first warp.
1658 ///
1659 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1660 /// shared smem[warp_size];
1661 /// For all data entries D in reduce_data:
1662 /// sync
1663 /// If (I am the first lane in each warp)
1664 /// Copy my local D to smem[warp_id]
1665 /// sync
1666 /// if (I am the first warp)
1667 /// Copy smem[thread_id] to my local D
1668 ///
1669 /// \param Loc The insert and source location description.
1670 /// \param ReductionInfos Array type containing the ReductionOps.
1671 /// \param FuncAttrs Optional param to specify any function attributes that
1672 /// need to be copied to the new function.
1673 ///
1674 /// \return The InterWarpCopy function.
1675 Expected<Function *>
1676 emitInterWarpCopyFunction(const LocationDescription &Loc,
1677 ArrayRef<ReductionInfo> ReductionInfos,
1678 AttributeList FuncAttrs);
1679
1680 /// This function emits a helper that copies all the reduction variables from
1681 /// the team into the provided global buffer for the reduction variables.
1682 ///
1683 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1684 /// For all data entries D in reduce_data:
1685 /// Copy local D to buffer.D[Idx]
1686 ///
1687 /// \param ReductionInfos Array type containing the ReductionOps.
1688 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1689 /// \param FuncAttrs Optional param to specify any function attributes that
1690 /// need to be copied to the new function.
1691 ///
1692 /// \return The ListToGlobalCopy function.
1693 Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1694 Type *ReductionsBufferTy,
1695 AttributeList FuncAttrs);
1696
1697 /// This function emits a helper that copies all the reduction variables from
1698 /// the provided global buffer back into the team's reduction variables.
1699 ///
1700 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
1701 /// For all data entries D in reduce_data:
1702 /// Copy buffer.D[Idx] to local D;
1703 ///
1704 /// \param ReductionInfos Array type containing the ReductionOps.
1705 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1706 /// \param FuncAttrs Optional param to specify any function attributes that
1707 /// need to be copied to the new function.
1708 ///
1709 /// \return The GlobalToList function.
1710 Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1711 Type *ReductionsBufferTy,
1712 AttributeList FuncAttrs);
1713
1714 /// This function emits a helper that reduces all the reduction variables from
1715 /// the team into the provided global buffer for the reduction variables.
1716 ///
1717 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1718 /// void *GlobPtrs[];
1719 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1720 /// ...
1721 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1722 /// reduce_function(GlobPtrs, reduce_data);
1723 ///
1724 /// \param ReductionInfos Array type containing the ReductionOps.
1725 /// \param ReduceFn The reduction function.
1726 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1727 /// \param FuncAttrs Optional param to specify any function attributes that
1728 /// need to be copied to the new function.
1729 ///
1730 /// \return The ListToGlobalReduce function.
1731 Function *
1732 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1733 Function *ReduceFn, Type *ReductionsBufferTy,
1734 AttributeList FuncAttrs);
1735
1736 /// This function emits a helper that reduces all the reduction variables from
1737 /// the team into the provided global buffer for the reduction variables.
1738 ///
1739 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1740 /// void *GlobPtrs[];
1741 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1742 /// ...
1743 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1744 /// reduce_function(reduce_data, GlobPtrs);
1745 ///
1746 /// \param ReductionInfos Array type containing the ReductionOps.
1747 /// \param ReduceFn The reduction function.
1748 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1749 /// \param FuncAttrs Optional param to specify any function attributes that
1750 /// need to be copied to the new function.
1751 ///
1752 /// \return The GlobalToListReduce function.
1753 Function *
1754 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1755 Function *ReduceFn, Type *ReductionsBufferTy,
1756 AttributeList FuncAttrs);
1757
1758 /// Get the function name of a reduction function.
1759 std::string getReductionFuncName(StringRef Name) const;
1760
1761 /// Emits reduction function.
1762 /// \param ReducerName Name of the function calling the reduction.
1763 /// \param ReductionInfos Array type containing the ReductionOps.
1764 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1765 /// CodeGenCB kind.
1766 /// \param FuncAttrs Optional param to specify any function attributes that
1767 /// need to be copied to the new function.
1768 ///
1769 /// \return The reduction function.
1770 Expected<Function *> createReductionFunction(
1771 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1772 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1773 AttributeList FuncAttrs = {});
1774
1775public:
1776 ///
1777 /// Design of OpenMP reductions on the GPU
1778 ///
1779 /// Consider a typical OpenMP program with one or more reduction
1780 /// clauses:
1781 ///
1782 /// float foo;
1783 /// double bar;
1784 /// #pragma omp target teams distribute parallel for \
1785 /// reduction(+:foo) reduction(*:bar)
1786 /// for (int i = 0; i < N; i++) {
1787 /// foo += A[i]; bar *= B[i];
1788 /// }
1789 ///
1790 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1791 /// all teams. In our OpenMP implementation on the NVPTX device an
1792 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1793 /// within a team are mapped to CUDA threads within a threadblock.
1794 /// Our goal is to efficiently aggregate values across all OpenMP
1795 /// threads such that:
1796 ///
1797 /// - the compiler and runtime are logically concise, and
1798 /// - the reduction is performed efficiently in a hierarchical
1799 /// manner as follows: within OpenMP threads in the same warp,
1800 /// across warps in a threadblock, and finally across teams on
1801 /// the NVPTX device.
1802 ///
1803 /// Introduction to Decoupling
1804 ///
1805 /// We would like to decouple the compiler and the runtime so that the
1806 /// latter is ignorant of the reduction variables (number, data types)
1807 /// and the reduction operators. This allows a simpler interface
1808 /// and implementation while still attaining good performance.
1809 ///
1810 /// Pseudocode for the aforementioned OpenMP program generated by the
1811 /// compiler is as follows:
1812 ///
1813 /// 1. Create private copies of reduction variables on each OpenMP
1814 /// thread: 'foo_private', 'bar_private'
1815 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1816 /// to it and writes the result in 'foo_private' and 'bar_private'
1817 /// respectively.
1818 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1819 /// and store the result on the team master:
1820 ///
1821 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1822 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1823 ///
1824 /// where:
1825 /// struct ReduceData {
1826 /// double *foo;
1827 /// double *bar;
1828 /// } reduceData
1829 /// reduceData.foo = &foo_private
1830 /// reduceData.bar = &bar_private
1831 ///
1832 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1833 /// auxiliary functions generated by the compiler that operate on
1834 /// variables of type 'ReduceData'. They aid the runtime in performing
1835 /// algorithmic steps in a data-agnostic manner.
1836 ///
1837 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1838 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1839 /// same warp. It takes the following arguments as input:
1840 ///
1841 /// a. variable of type 'ReduceData' on the calling lane,
1842 /// b. its lane_id,
1843 /// c. an offset relative to the current lane_id to generate a
1844 /// remote_lane_id. The remote lane contains the second
1845 /// variable of type 'ReduceData' that is to be reduced.
1846 /// d. an algorithm version parameter determining which reduction
1847 /// algorithm to use.
1848 ///
1849 /// 'shuffleReduceFn' retrieves data from the remote lane using
1850 /// efficient GPU shuffle intrinsics and reduces, using the
1851 /// algorithm specified by the 4th parameter, the two operands
1852 /// element-wise. The result is written to the first operand.
1853 ///
1854 /// Different reduction algorithms are implemented in different
1855 /// runtime functions, all calling 'shuffleReduceFn' to perform
1856 /// the essential reduction step. Therefore, based on the 4th
1857 /// parameter, this function behaves slightly differently to
1858 /// cooperate with the runtime to ensure correctness under
1859 /// different circumstances.
1860 ///
1861 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1862 /// reduced variables across warps. It tunnels, through CUDA
1863 /// shared memory, the thread-private data of type 'ReduceData'
1864 /// from lane 0 of each warp to a lane in the first warp.
1865 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1866 /// The last team writes the global reduced value to memory.
1867 ///
1868 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1869 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1870 /// scratchpadCopyFn, loadAndReduceFn)
1871 ///
1872 /// 'scratchpadCopyFn' is a helper that stores reduced
1873 /// data from the team master to a scratchpad array in
1874 /// global memory.
1875 ///
1876 /// 'loadAndReduceFn' is a helper that loads data from
1877 /// the scratchpad array and reduces it with the input
1878 /// operand.
1879 ///
1880 /// These compiler generated functions hide address
1881 /// calculation and alignment information from the runtime.
1882 /// 5. if ret == 1:
1883 /// The team master of the last team stores the reduced
1884 /// result to the globals in memory.
1885 /// foo += reduceData.foo; bar *= reduceData.bar
1886 ///
1887 ///
1888 /// Warp Reduction Algorithms
1889 ///
1890 /// On the warp level, we have three algorithms implemented in the
1891 /// OpenMP runtime depending on the number of active lanes:
1892 ///
1893 /// Full Warp Reduction
1894 ///
1895 /// The reduce algorithm within a warp where all lanes are active
1896 /// is implemented in the runtime as follows:
1897 ///
1898 /// full_warp_reduce(void *reduce_data,
1899 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1900 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1901 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1902 /// }
1903 ///
1904 /// The algorithm completes in log(2, WARPSIZE) steps.
1905 ///
1906 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1907 /// not used; we therefore save instructions by not retrieving lane_id
1908 /// from the corresponding special registers. The 4th parameter, which
1909 /// represents the version of the algorithm being used, is set to 0 to
1910 /// signify full warp reduction.
1911 ///
1912 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1913 ///
1914 /// #reduce_elem refers to an element in the local lane's data structure
1915 /// #remote_elem is retrieved from a remote lane
1916 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1917 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1918 ///
1919 /// Contiguous Partial Warp Reduction
1920 ///
1921 /// This reduce algorithm is used within a warp where only the first
1922 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1923 /// number of OpenMP threads in a parallel region is not a multiple of
1924 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1925 ///
1926 /// void
1927 /// contiguous_partial_reduce(void *reduce_data,
1928 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1929 /// int size, int lane_id) {
1930 /// int curr_size;
1931 /// int offset;
1932 /// curr_size = size;
1933 /// offset = curr_size/2;
1934 /// while (offset>0) {
1935 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1936 /// curr_size = (curr_size+1)/2;
1937 /// offset = curr_size/2;
1938 /// }
1939 /// }
1940 ///
1941 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1942 ///
1943 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1944 /// if (lane_id < offset)
1945 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1946 /// else
1947 /// reduce_elem = remote_elem
1948 ///
1949 /// This algorithm assumes that the data to be reduced are located in a
1950 /// contiguous subset of lanes starting from the first. When there is
1951 /// an odd number of active lanes, the data in the last lane is not
1952 /// aggregated with any other lane's data but is instead copied over.
1953 ///
1954 /// Dispersed Partial Warp Reduction
1955 ///
1956 /// This algorithm is used within a warp when any discontiguous subset of
1957 /// lanes are active. It is used to implement the reduction operation
1958 /// across lanes in an OpenMP simd region or in a nested parallel region.
1959 ///
1960 /// void
1961 /// dispersed_partial_reduce(void *reduce_data,
1962 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1963 /// int size, remote_id;
1964 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1965 /// do {
1966 /// remote_id = next_active_lane_id_right_after_me();
1967 /// # the above function returns 0 if no active lane
1968 /// # is present right after the current lane.
1969 /// size = number_of_active_lanes_in_this_warp();
1970 /// logical_lane_id /= 2;
1971 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1972 /// remote_id-1-threadIdx.x, 2);
1973 /// } while (logical_lane_id % 2 == 0 && size > 1);
1974 /// }
1975 ///
1976 /// There is no assumption made about the initial state of the reduction.
1977 /// Any number of lanes (>=1) could be active at any position. The reduction
1978 /// result is returned in the first active lane.
1979 ///
1980 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1981 ///
1982 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1983 /// if (lane_id % 2 == 0 && offset > 0)
1984 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1985 /// else
1986 /// reduce_elem = remote_elem
1987 ///
1988 ///
1989 /// Intra-Team Reduction
1990 ///
1991 /// This function, as implemented in the runtime call
1992 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1993 /// threads in a team. It first reduces within a warp using the
1994 /// aforementioned algorithms. We then proceed to gather all such
1995 /// reduced values at the first warp.
1996 ///
1997 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1998 /// data from each of the "warp masters" (the zeroth lane of each warp, where
1999 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2000 /// a mathematical sense) the problem of reduction across warp masters in
2001 /// a block to the problem of warp reduction.
2002 ///
2003 ///
2004 /// Inter-Team Reduction
2005 ///
2006 /// Once a team has reduced its data to a single value, it is stored in
2007 /// a global scratchpad array. Since each team has a distinct slot, this
2008 /// can be done without locking.
2009 ///
2010 /// The last team to write to the scratchpad array proceeds to reduce the
2011 /// scratchpad array. One or more workers in the last team use the helper
2012 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2013 /// the k'th worker reduces every k'th element.
2014 ///
2015 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2016 /// reduce across workers and compute a globally reduced value.
2017 ///
2018 /// \param Loc The location where the reduction was
2019 /// encountered. Must be within the associated
2020 /// directive and after the last local access to the
2021 /// reduction variables.
2022 /// \param AllocaIP An insertion point suitable for allocas usable
2023 /// in reductions.
2024 /// \param CodeGenIP An insertion point suitable for code generation.
2025 /// \param ReductionInfos A list of info on each reduction
2026 /// variable.
2027 /// \param IsNoWait Optional flag set if the reduction is marked as
2028 /// nowait.
2029 /// \param IsTeamsReduction Optional flag set if it is a teams
2030 /// reduction.
2031 /// \param GridValue Optional GPU grid value.
2032 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
2033 /// used for teams reduction.
2034 /// \param SrcLocInfo Source location information global.
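/// A minimal usage sketch (hypothetical surrounding code; `RI` is a
/// ReductionInfo describing a single reduction variable):
/// \code
///   InsertPointOrErrorTy AfterIP = OMPBuilder.createReductionsGPU(
///       Loc, AllocaIP, Builder.saveIP(), {RI}, /*IsNoWait=*/false,
///       /*IsTeamsReduction=*/true);
/// \endcode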
2035 LLVM_ABI InsertPointOrErrorTy createReductionsGPU(
2036 const LocationDescription &Loc, InsertPointTy AllocaIP,
2037 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
2038 bool IsNoWait = false, bool IsTeamsReduction = false,
2039 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
2040 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
2041 Value *SrcLocInfo = nullptr);
2042
2043 // TODO: provide atomic and non-atomic reduction generators for reduction
2044 // operators defined by the OpenMP specification.
2045
2046 /// Generator for '#omp reduction'.
2047 ///
2048 /// Emits the IR instructing the runtime to perform the specific kind of
2049 /// reductions. Expects reduction variables to have been privatized and
2050 /// initialized to reduction-neutral values separately. Emits the calls to
2051 /// runtime functions as well as the reduction function and the basic blocks
2052 /// performing the reduction atomically and non-atomically.
2053 ///
2054 /// The code emitted for the following:
2055 ///
2056 /// \code
2057 /// type var_1;
2058 /// type var_2;
2059 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
2060 /// /* body */;
2061 /// \endcode
2062 ///
2063 /// corresponds to the following sketch.
2064 ///
2065 /// \code
2066 /// void _outlined_par() {
2067 /// // N is the number of different reductions.
2068 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
2069 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
2070 /// _omp_reduction_func,
2071 /// _gomp_critical_user.reduction.var)) {
2072 /// case 1: {
2073 /// var_1 = var_1 <reduction-op> privatized_var_1;
2074 /// var_2 = var_2 <reduction-op> privatized_var_2;
2075 /// // ...
2076 /// __kmpc_end_reduce(...);
2077 /// break;
2078 /// }
2079 /// case 2: {
2080 /// _Atomic<ReductionOp>(var_1, privatized_var_1);
2081 /// _Atomic<ReductionOp>(var_2, privatized_var_2);
2082 /// // ...
2083 /// break;
2084 /// }
2085 /// default: break;
2086 /// }
2087 /// }
2088 ///
2089 /// void _omp_reduction_func(void **lhs, void **rhs) {
2090 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
2091 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
2092 /// // ...
2093 /// }
2094 /// \endcode
2095 ///
2096 /// \param Loc The location where the reduction was
2097 /// encountered. Must be within the associated
2098 /// directive and after the last local access to the
2099 /// reduction variables.
2100 /// \param AllocaIP An insertion point suitable for allocas usable
2101 /// in reductions.
2102 /// \param ReductionInfos A list of info on each reduction variable.
2103 /// \param IsNoWait A flag set if the reduction is marked as nowait.
2104 /// \param IsByRef Per-variable flags indicating whether each reduction is
2105 /// performed by reference (true) or by value (false).
2106 /// \param IsTeamsReduction Optional flag set if it is a teams
2107 /// reduction.
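/// A minimal usage sketch (hypothetical surrounding code; `Var` and `PrivVar`
/// are the shared and privatized reduction variables and `SumGen` is a
/// ReductionGenCBTy callback; no Clang or atomic variants are provided):
/// \code
///   OpenMPIRBuilder::ReductionInfo RI(
///       Builder.getInt32Ty(), Var, PrivVar,
///       OpenMPIRBuilder::EvalKind::Scalar, SumGen,
///       /*ReductionGenClang=*/nullptr, /*AtomicReductionGen=*/nullptr);
///   InsertPointOrErrorTy AfterIP = OMPBuilder.createReductions(
///       Loc, AllocaIP, {RI}, /*IsByRef=*/{false});
/// \endcode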
2108 LLVM_ABI InsertPointOrErrorTy createReductions(
2109 const LocationDescription &Loc, InsertPointTy AllocaIP,
2110 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
2111 bool IsNoWait = false, bool IsTeamsReduction = false);
2112
2113 ///}
2114
2115 /// Return the insertion point used by the underlying IRBuilder.
2116 InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
2117
2118 /// Update the internal location to \p Loc.
2119 bool updateToLocation(const LocationDescription &Loc) {
2120 Builder.restoreIP(Loc.IP);
2121 Builder.SetCurrentDebugLocation(Loc.DL);
2122 return Loc.IP.getBlock() != nullptr;
2123 }
2124
2125 /// Return the function declaration for the runtime function with \p FnID.
2126 LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M,
2127 omp::RuntimeFunction FnID);
2128
2129 LLVM_ABI Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);
2130
2131 /// Return the (LLVM-IR) string describing the source location \p LocStr.
2132 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef LocStr,
2133 uint32_t &SrcLocStrSize);
2134
2135 /// Return the (LLVM-IR) string describing the default source location.
2136 LLVM_ABI Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);
2137
2138 /// Return the (LLVM-IR) string describing the source location identified by
2139 /// the arguments.
2140 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef FunctionName,
2141 StringRef FileName, unsigned Line,
2142 unsigned Column,
2143 uint32_t &SrcLocStrSize);
2144
2145 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
2146 /// fallback if \p DL does not specify the function name.
2147 LLVM_ABI Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
2148 Function *F = nullptr);
2149
2150 /// Return the (LLVM-IR) string describing the source location \p Loc.
2151 LLVM_ABI Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
2152 uint32_t &SrcLocStrSize);
2153
2154 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
2155 /// TODO: Create an enum class for the Reserve2Flags
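/// For example (a sketch using the default source location):
/// \code
///   uint32_t SrcLocStrSize;
///   Constant *SrcLocStr =
///       OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
///   Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
/// \endcode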
2156 LLVM_ABI Constant *getOrCreateIdent(Constant *SrcLocStr,
2157 uint32_t SrcLocStrSize,
2158 omp::IdentFlag Flags = omp::IdentFlag(0),
2159 unsigned Reserve2Flags = 0);
2160
2161 /// Create a hidden global flag \p Name in the module with initial value \p
2162 /// Value.
2163 LLVM_ABI GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
2164
2165 /// Emit the llvm.used metadata.
2166 LLVM_ABI void emitUsed(StringRef Name, ArrayRef<llvm::WeakTrackingVH> List);
2167
2168 /// Emit the kernel execution mode.
2169 LLVM_ABI GlobalVariable *
2170 emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode);
2171
2172 /// Generate control flow and cleanup for cancellation.
2173 ///
2174 /// \param CancelFlag Flag indicating if the cancellation is performed.
2175 /// \param CanceledDirective The kind of directive that is canceled.
2176 /// \param ExitCB Extra code to be generated in the exit block.
2177 ///
2178 /// \return an error, if any were triggered during execution.
2179 LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag,
2180 omp::Directive CanceledDirective,
2181 FinalizeCallbackTy ExitCB = {});
2182
2183 /// Generate a target region entry call.
2184 ///
2185 /// \param Loc The location at which the request originated and is fulfilled.
2186 /// \param AllocaIP The insertion point to be used for alloca instructions.
2187 /// \param Return Return value of the created function returned by reference.
2188 /// \param DeviceID Identifier for the device via the 'device' clause.
2189 /// \param NumTeams Number of teams for the region via the 'num_teams' clause
2190 /// or 0 if unspecified and -1 if there is no 'teams' clause.
2191 /// \param NumThreads Number of threads via the 'thread_limit' clause.
2192 /// \param HostPtr Pointer to the host-side pointer of the target kernel.
2193 /// \param KernelArgs Array of arguments to the kernel.
2194 LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc,
2195 InsertPointTy AllocaIP,
2196 Value *&Return, Value *Ident,
2197 Value *DeviceID, Value *NumTeams,
2198 Value *NumThreads, Value *HostPtr,
2199 ArrayRef<Value *> KernelArgs);
2200
2201 /// Generate a flush runtime call.
2202 ///
2203 /// \param Loc The location at which the request originated and is fulfilled.
2204 LLVM_ABI void emitFlush(const LocationDescription &Loc);
2205
2206 /// The finalization stack made up of finalize callbacks currently in-flight,
2207 /// wrapped into FinalizationInfo objects that reference also the finalization
2208 /// target block and the kind of cancellable directive.
2209 SmallVector<FinalizationInfo, 8> FinalizationStack;
2210
2211 /// Return true if the last entry in the finalization stack is of kind \p DK
2212 /// and cancellable.
2213 bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2214 return !FinalizationStack.empty() &&
2215 FinalizationStack.back().IsCancellable &&
2216 FinalizationStack.back().DK == DK;
2217 }
2218
2219 /// Generate a taskwait runtime call.
2220 ///
2221 /// \param Loc The location at which the request originated and is fulfilled.
2222 LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc);
2223
2224 /// Generate a taskyield runtime call.
2225 ///
2226 /// \param Loc The location at which the request originated and is fulfilled.
2227 LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc);
2228
2229 /// Return the current thread ID.
2230 ///
2231 /// \param Ident The ident (ident_t*) describing the query origin.
2232 LLVM_ABI Value *getOrCreateThreadID(Value *Ident);
2233
2234 /// The OpenMPIRBuilder Configuration
2235 OpenMPIRBuilderConfig Config;
2236
2237 /// The underlying LLVM-IR module
2238 Module &M;
2239
2240 /// The LLVM-IR Builder used to create IR.
2241 IRBuilder<> Builder;
2242
2243 /// Map to remember source location strings
2244 StringMap<Constant *> SrcLocStrMap;
2245
2246 /// Map to remember existing ident_t*.
2247 DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;
2248
2249 /// Info manager to keep track of target regions.
2250 OffloadEntriesInfoManager OffloadInfoManager;
2251
2252 /// The target triple of the underlying module.
2253 const Triple T;
2254
2255 /// Helper that contains information about regions we need to outline
2256 /// during finalization.
2257 struct OutlineInfo {
2258 using PostOutlineCBTy = std::function<void(Function &)>;
2259 PostOutlineCBTy PostOutlineCB;
2260 BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
2261 SmallVector<Value *, 2> ExcludeArgsFromAggregate;
2262
2263 /// Collect all blocks in between EntryBB and ExitBB in both the given
2264 /// vector and set.
2265 LLVM_ABI void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
2266 SmallVectorImpl<BasicBlock *> &BlockVector);
2267
2268 /// Return the function that contains the region to be outlined.
2269 Function *getFunction() const { return EntryBB->getParent(); }
2270 };
2271
2272 /// Collection of regions that need to be outlined during finalization.
2273 SmallVector<OutlineInfo, 16> OutlineInfos;
2274
2275 /// A collection of candidate target functions whose constant allocas will
2276 /// be raised on a call of finalize, after all currently enqueued
2277 /// outline infos have been processed.
2278 SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;
2279
2280 /// Collection of owned canonical loop objects that eventually need to be
2281 /// free'd.
2282 std::forward_list<CanonicalLoopInfo> LoopInfos;
2283
2284 /// Collection of owned ScanInfo objects that eventually need to be free'd.
2285 std::forward_list<ScanInfo> ScanInfos;
2286
2287 /// Add a new region that will be outlined later.
2288 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2289
2290 /// An ordered map of auto-generated variables to their unique names.
2291 /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2292 /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2293 /// <mangled_name_for_global_var> + ".cache." for the cache of threadprivate
2294 /// variables.
2295 StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;
2296
2297 /// Computes the size of type in bytes.
2298 LLVM_ABI Value *getSizeInBytes(Value *BasePtr);
2299
2300 // Emit a branch from the current block to the Target block only if
2301 // the current block has a terminator.
2302 LLVM_ABI void emitBranch(BasicBlock *Target);
2303
2304 // If BB has no use then delete it and return. Else place BB after the current
2305 // block, if possible, or else at the end of the function. Also add a branch
2306 // from current block to BB if current block does not have a terminator.
2307 LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn,
2308 bool IsFinished = false);
2309
2310 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
2311 /// Here is the logic:
2312 /// if (Cond) {
2313 /// ThenGen();
2314 /// } else {
2315 /// ElseGen();
2316 /// }
2317 ///
2318 /// \return an error, if any were triggered during execution.
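/// A minimal sketch of the callbacks (hypothetical bodies; each callback
/// receives an alloca insertion point and a code-generation insertion point):
/// \code
///   auto ThenGen = [&](InsertPointTy AllocaIP,
///                      InsertPointTy CodeGenIP) -> Error {
///     // ... emit the 'then' code at CodeGenIP ...
///     return Error::success();
///   };
///   auto ElseGen = [&](InsertPointTy AllocaIP,
///                      InsertPointTy CodeGenIP) -> Error {
///     // ... emit the 'else' code at CodeGenIP ...
///     return Error::success();
///   };
///   if (Error Err = OMPBuilder.emitIfClause(Cond, ThenGen, ElseGen, AllocaIP))
///     return Err;
/// \endcode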
2319 LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
2320 BodyGenCallbackTy ElseGen,
2321 InsertPointTy AllocaIP = {});
2322
2323 /// Create the global variable holding the offload mappings information.
2324 LLVM_ABI GlobalVariable *
2325 createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
2326 std::string VarName);
2327
2328 /// Create the global variable holding the offload names information.
2329 LLVM_ABI GlobalVariable *
2330 createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
2331 std::string VarName);
2332
2333 struct MapperAllocas {
2334 AllocaInst *ArgsBase = nullptr;
2335 AllocaInst *Args = nullptr;
2336 AllocaInst *ArgSizes = nullptr;
2337 };
2338
2339 /// Create the allocas instruction used in call to mapper functions.
2340 LLVM_ABI void createMapperAllocas(const LocationDescription &Loc,
2341 InsertPointTy AllocaIP,
2342 unsigned NumOperands,
2343 struct MapperAllocas &MapperAllocas);
2344
2345 /// Create the call for the target mapper function.
2346 /// \param Loc The source location description.
2347 /// \param MapperFunc Function to be called.
2348 /// \param SrcLocInfo Source location information global.
2349 /// \param MaptypesArg The argument types.
2350 /// \param MapnamesArg The argument names.
2351 /// \param MapperAllocas The AllocaInst used for the call.
2352 /// \param DeviceID Device ID for the call.
2353 /// \param NumOperands Number of operands in the call.
2354 LLVM_ABI void emitMapperCall(const LocationDescription &Loc,
2355 Function *MapperFunc, Value *SrcLocInfo,
2356 Value *MaptypesArg, Value *MapnamesArg,
2357 struct MapperAllocas &MapperAllocas,
2358 int64_t DeviceID, unsigned NumOperands);
2359
2360 /// Container for the arguments used to pass data to the runtime library.
2361 struct TargetDataRTArgs {
2362 /// The array of base pointer passed to the runtime library.
2363 Value *BasePointersArray = nullptr;
2364 /// The array of section pointers passed to the runtime library.
2365 Value *PointersArray = nullptr;
2366 /// The array of sizes passed to the runtime library.
2367 Value *SizesArray = nullptr;
2368 /// The array of map types passed to the runtime library for the beginning
2369 /// of the region or for the entire region if there are no separate map
2370 /// types for the region end.
2371 Value *MapTypesArray = nullptr;
2372 /// The array of map types passed to the runtime library for the end of the
2373 /// region, or nullptr if there are no separate map types for the region
2374 /// end.
2375 Value *MapTypesArrayEnd = nullptr;
2376 /// The array of user-defined mappers passed to the runtime library.
2377 Value *MappersArray = nullptr;
2378 /// The array of original declaration names of mapped pointers sent to the
2379 /// runtime library for debugging
2380 Value *MapNamesArray = nullptr;
2381
2382 explicit TargetDataRTArgs() {}
2383 explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
2384 Value *SizesArray, Value *MapTypesArray,
2385 Value *MapTypesArrayEnd, Value *MappersArray,
2386 Value *MapNamesArray)
2387 : BasePointersArray(BasePointersArray), PointersArray(PointersArray),
2388 SizesArray(SizesArray), MapTypesArray(MapTypesArray),
2389 MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
2390 MapNamesArray(MapNamesArray) {}
2391 };
2392
2393 /// Container to pass the default attributes with which a kernel must be
2394 /// launched, used to set kernel attributes and populate associated static
2395 /// structures.
2396 ///
2397 /// For max values, < 0 means unset, == 0 means set but unknown at compile
2398 /// time. The number of max values will be 1 except for the case where
2399 /// ompx_bare is set.
2400 struct TargetKernelDefaultAttrs {
2401 omp::OMPTgtExecModeFlags ExecFlags =
2402 omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2403 SmallVector<int32_t, 3> MaxTeams = {-1};
2404 int32_t MinTeams = 1;
2405 SmallVector<int32_t, 3> MaxThreads = {-1};
2406 int32_t MinThreads = 1;
2407 int32_t ReductionDataSize = 0;
2408 int32_t ReductionBufferLength = 0;
2409 };
2410
2411 /// Container to pass LLVM IR runtime values or constants related to the
2412 /// number of teams and threads with which the kernel must be launched, as
2413 /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
2414 /// kernel. These must be defined in the host prior to the call to the kernel
2415 /// launch OpenMP RTL function.
2416 struct TargetKernelRuntimeAttrs {
2417 SmallVector<Value *, 3> MaxTeams = {nullptr};
2418 Value *MinTeams = nullptr;
2419 SmallVector<Value *, 3> TargetThreadLimit = {nullptr};
2420 SmallVector<Value *, 3> TeamsThreadLimit = {nullptr};
2421
2422 /// 'parallel' construct 'num_threads' clause value, if present and it is an
2423 /// SPMD kernel.
2424 Value *MaxThreads = nullptr;
2425
2426 /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if
2427 /// it is a generic kernel.
2428 Value *LoopTripCount = nullptr;
2429 };
2430
2431 /// Data structure that contains the needed information to construct the
2432 /// kernel args vector.
2433 struct TargetKernelArgs {
2434 /// Number of arguments passed to the runtime library.
2435 unsigned NumTargetItems = 0;
2436 /// Arguments passed to the runtime library
2437 TargetDataRTArgs RTArgs;
2438 /// The number of iterations
2439 Value *NumIterations = nullptr;
2440 /// The number of teams.
2441 ArrayRef<Value *> NumTeams;
2442 /// The number of threads.
2443 ArrayRef<Value *> NumThreads;
2444 /// The size of the dynamic shared memory.
2445 Value *DynCGGroupMem = nullptr;
2446 /// True if the kernel has 'no wait' clause.
2447 bool HasNoWait = false;
2448
2449 // Constructors for TargetKernelArgs.
2450 TargetKernelArgs() {}
2451 TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
2452 Value *NumIterations, ArrayRef<Value *> NumTeams,
2453 ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
2454 bool HasNoWait)
2455 : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
2456 NumIterations(NumIterations), NumTeams(NumTeams),
2457 NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
2458 HasNoWait(HasNoWait) {}
2459 };
2460
2461 /// Create the kernel args vector used by emitTargetKernel. This function
2462 /// creates various constant values that are used in the resulting args
2463 /// vector.
2464 LLVM_ABI static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2465 IRBuilderBase &Builder,
2466 SmallVector<Value *> &ArgsVector);
2467
2468 /// Struct that keeps the information that should be kept throughout
2469 /// a 'target data' region.
2470 class TargetDataInfo {
2471 /// Set to true if device pointer information has to be obtained.
2472 bool RequiresDevicePointerInfo = false;
2473 /// Set to true if Clang emits separate runtime calls for the beginning and
2474 /// end of the region. These calls might have separate map type arrays.
2475 bool SeparateBeginEndCalls = false;
2476
2477 public:
2478 TargetDataRTArgs RTArgs;
2479
2480 SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
2481 DevicePtrInfoMap;
2482
2483 /// Indicate whether any user-defined mapper exists.
2484 bool HasMapper = false;
2485 /// The total number of pointers passed to the runtime library.
2486 unsigned NumberOfPtrs = 0u;
2487
2488 bool EmitDebug = false;
2489
2490 /// Whether the `target ... data` directive has a `nowait` clause.
2491 bool HasNoWait = false;
2492
2493 explicit TargetDataInfo() {}
2494 explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2495 bool SeparateBeginEndCalls)
2496 : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2497 SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2498 /// Clear information about the data arrays.
2499 void clearArrayInfo() {
2500 RTArgs = TargetDataRTArgs();
2501 HasMapper = false;
2502 NumberOfPtrs = 0u;
2503 }
2504 /// Return true if the current target data information has valid arrays.
2505 bool isValid() {
2506 return RTArgs.BasePointersArray && RTArgs.PointersArray &&
2507 RTArgs.SizesArray && RTArgs.MapTypesArray &&
2508 (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
2509 }
2510 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
2511 bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2512 };
2513
2514 enum class DeviceInfoTy { None, Pointer, Address };
2515 using MapValuesArrayTy = SmallVector<Value *, 4>;
2516 using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
2517 using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
2518 using MapNamesArrayTy = SmallVector<Constant *, 4>;
2519 using MapDimArrayTy = SmallVector<uint64_t, 4>;
2520 using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;
2521
2522 /// This structure contains combined information generated for mappable
2523 /// clauses, including base pointers, pointers, sizes, map types, user-defined
2524 /// mappers, and non-contiguous information.
2525 struct MapInfosTy {
2526 struct StructNonContiguousInfo {
2527 bool IsNonContiguous = false;
2528 MapDimArrayTy Dims;
2529 MapNonContiguousArrayTy Offsets;
2530 MapNonContiguousArrayTy Counts;
2531 MapNonContiguousArrayTy Strides;
2532 };
2533 MapValuesArrayTy BasePointers;
2534 MapValuesArrayTy Pointers;
2535 MapDeviceInfoArrayTy DevicePointers;
2536 MapValuesArrayTy Sizes;
2537 MapFlagsArrayTy Types;
2538 MapNamesArrayTy Names;
2539 StructNonContiguousInfo NonContigInfo;
2540
2541 /// Append arrays in \a CurInfo.
2542 void append(MapInfosTy &CurInfo) {
2543 BasePointers.append(CurInfo.BasePointers.begin(),
2544 CurInfo.BasePointers.end());
2545 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2546 DevicePointers.append(CurInfo.DevicePointers.begin(),
2547 CurInfo.DevicePointers.end());
2548 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2549 Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2550 Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2551 NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
2552 CurInfo.NonContigInfo.Dims.end());
2553 NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
2554 CurInfo.NonContigInfo.Offsets.end());
2555 NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
2556 CurInfo.NonContigInfo.Counts.end());
2557 NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
2558 CurInfo.NonContigInfo.Strides.end());
2559 }
2560 };
2561 using MapInfosOrErrorTy = Expected<MapInfosTy &>;
2562
2563 /// Callback function type for functions emitting the host fallback code that
2564 /// is executed when the kernel launch fails. It takes an insertion point as
2565 /// a parameter where the code should be emitted. It returns an insertion point
2566 /// that points right after the emitted code.
2567 using EmitFallbackCallbackTy =
2568 function_ref<InsertPointOrErrorTy(InsertPointTy)>;
2569
2570 /// Callback function type for emitting and fetching user-defined custom
2571 /// mappers.
2572 using CustomMapperCallbackTy =
2573 function_ref<Expected<Function *>(unsigned int)>;
2574
2575 /// Generate a target region entry call and host fallback call.
2576 ///
2577 /// \param Loc The location at which the request originated and is fulfilled.
2578 /// \param OutlinedFnID The outlined function ID.
2579 /// \param EmitTargetCallFallbackCB Callback function to generate host
2580 /// fallback code.
2581 /// \param Args Data structure holding information about the kernel arguments.
2582 /// \param DeviceID Identifier for the device via the 'device' clause.
2583 /// \param RTLoc Source location identifier
2584 /// \param AllocaIP The insertion point to be used for alloca instructions.
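  ///
  /// A hedged sketch of a call site; \c OMPBuilder is an initialized
  /// OpenMPIRBuilder, \c Builder its IRBuilder, and \c OutlinedFnID, \c KArgs,
  /// \c DeviceID, \c RTLoc and \c AllocaIP are assumed to exist already:
  /// \code{.cpp}
  /// auto FallbackCB =
  ///     [&](OpenMPIRBuilder::InsertPointTy IP)
  ///         -> OpenMPIRBuilder::InsertPointOrErrorTy {
  ///   Builder.restoreIP(IP);
  ///   // Emit the host-fallback call to the outlined function here.
  ///   return Builder.saveIP();
  /// };
  /// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.emitKernelLaunch(
  ///     Loc, OutlinedFnID, FallbackCB, KArgs, DeviceID, RTLoc, AllocaIP);
  /// if (!AfterIP)
  ///   return AfterIP.takeError();
  /// Builder.restoreIP(*AfterIP);
  /// \endcode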
2585 LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(
2586 const LocationDescription &Loc, Value *OutlinedFnID,
2587 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
2588 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
2589
2590 /// Callback type for generating the bodies of device directives that require
2591 /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses).
2592 ///
2593 /// \param DeviceID The ID of the device on which the target region will
2594 /// execute.
2595 /// \param RTLoc Source location identifier
2596 /// \param TargetTaskAllocaIP Insertion point for the alloca block of the
2597 /// generated task.
2598 ///
2599 /// \return An error, if any was triggered during execution.
2600 using TargetTaskBodyCallbackTy =
2601 function_ref<Error(Value *DeviceID, Value *RTLoc,
2602 IRBuilderBase::InsertPoint TargetTaskAllocaIP)>;
2603
2604 /// Generate a target-task for the target construct
2605 ///
2606 /// \param TaskBodyCB Callback to generate the actual body of the target task.
2607 /// \param DeviceID Identifier for the device via the 'device' clause.
2608 /// \param RTLoc Source location identifier
2609 /// \param AllocaIP The insertion point to be used for alloca instructions.
2610 /// \param Dependencies Vector of DependData objects holding information of
2611 /// dependencies as specified by the 'depend' clause.
2612 /// \param HasNoWait True if the target construct had 'nowait' on it, false
2613 /// otherwise
2614 LLVM_ABI InsertPointOrErrorTy emitTargetTask(
2615 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
2616 OpenMPIRBuilder::InsertPointTy AllocaIP,
2617 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
2618 const TargetDataRTArgs &RTArgs, bool HasNoWait);
2619
2620 /// Emit the arguments to be passed to the runtime library based on the
2621 /// arrays of base pointers, pointers, sizes, map types, and mappers. If
2622 /// ForEndCall, emit map types to be passed for the end of the region instead
2623 /// of the beginning.
2624 LLVM_ABI void emitOffloadingArraysArgument(
2625 IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
2626 OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall = false);
2627
2628 /// Emit an array of struct descriptors to be assigned to the offload args.
2629 LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
2630 InsertPointTy CodeGenIP,
2631 MapInfosTy &CombinedInfo,
2632 TargetDataInfo &Info);
2633
2634 /// Emit the arrays used to pass the captures and map information to the
2635 /// offloading runtime library. If there is no map or capture information,
2636 /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2637 /// that contains information generated for mappable clauses,
2638 /// including base pointers, pointers, sizes, map types, user-defined mappers.
2639 LLVM_ABI Error emitOffloadingArrays(
2640 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2641 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
2642 bool IsNonContiguous = false,
2643 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2644
2645 /// Allocates memory for and populates the arrays required for offloading
2646 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2647 /// emits their base addresses as arguments to be passed to the runtime
2648 /// library. In essence, this function is a combination of
2649 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2650 /// be preferred by clients of OpenMPIRBuilder.
2651 LLVM_ABI Error emitOffloadingArraysAndArgs(
2652 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2653 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2654 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous = false,
2655 bool ForEndCall = false,
2656 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2657
2658 /// Creates offloading entry for the provided entry ID \a ID, address \a
2659 /// Addr, size \a Size, and flags \a Flags.
2660 LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
2661 int32_t Flags, GlobalValue::LinkageTypes,
2662 StringRef Name = "");
2663
2664 /// The kind of errors that can occur when emitting the offload entries and
2665 /// metadata.
2666 enum EmitMetadataErrorKind {
2667 EMIT_MD_TARGET_REGION_ERROR,
2668 EMIT_MD_DECLARE_TARGET_ERROR,
2669 EMIT_MD_GLOBAL_VAR_LINK_ERROR
2670 };
2671
2672 /// Callback function type for reporting errors when emitting the offload
2673 /// entries and metadata.
2673 using EmitMetadataErrorReportFunctionTy =
2674 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2675
2676 // Emit the offloading entries and metadata so that the device codegen side
2677 // can easily figure out what to emit. The produced metadata looks like
2678 // this:
2679 //
2680 // !omp_offload.info = !{!1, ...}
2681 //
2682 // We only generate metadata for functions that contain target regions.
2683 LLVM_ABI void createOffloadEntriesAndInfoMetadata(
2684 EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2685
2686public:
2687 /// Generator for __kmpc_copyprivate
2688 ///
2689 /// \param Loc The source location description.
2690 /// \param BufSize Number of elements in the buffer.
2691 /// \param CpyBuf List of pointers to data to be copied.
2692 /// \param CpyFn function to call for copying data.
2693 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2694 ///
2695 /// \return The insertion position *after* the CopyPrivate call.
2696
2697 LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc,
2698 llvm::Value *BufSize,
2699 llvm::Value *CpyBuf,
2700 llvm::Value *CpyFn,
2701 llvm::Value *DidIt);
2702
2703 /// Generator for '#omp single'
2704 ///
2705 /// \param Loc The source location description.
2706 /// \param BodyGenCB Callback that will generate the region code.
2707 /// \param FiniCB Callback to finalize variable copies.
2708 /// \param IsNowait If false, a barrier is emitted.
2709 /// \param CPVars copyprivate variables.
2710 /// \param CPFuncs copy functions to use for each copyprivate variable.
2711 ///
2712 /// \returns The insertion position *after* the single call.
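  ///
  /// Minimal sketch; \c BodyGenCB and \c FiniCB are callbacks with the
  /// signatures declared earlier in this class, and the error handling assumes
  /// the enclosing function returns llvm::Error or llvm::Expected:
  /// \code{.cpp}
  /// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
  ///     OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /*IsNowait=*/false);
  /// if (!AfterIP)
  ///   return AfterIP.takeError();
  /// Builder.restoreIP(*AfterIP); // continue emitting after the single region
  /// \endcode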
2713 LLVM_ABI InsertPointOrErrorTy
2714 createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2715 FinalizeCallbackTy FiniCB, bool IsNowait,
2716 ArrayRef<llvm::Value *> CPVars = {},
2717 ArrayRef<llvm::Function *> CPFuncs = {});
2718
2719 /// Generator for '#omp master'
2720 ///
2721 /// \param Loc The insert and source location description.
2722 /// \param BodyGenCB Callback that will generate the region code.
2723 /// \param FiniCB Callback to finalize variable copies.
2724 ///
2725 /// \returns The insertion position *after* the master.
2726 LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc,
2727 BodyGenCallbackTy BodyGenCB,
2728 FinalizeCallbackTy FiniCB);
2729
2730 /// Generator for '#omp masked'
2731 ///
2732 /// \param Loc The insert and source location description.
2733 /// \param BodyGenCB Callback that will generate the region code.
2734 /// \param FiniCB Callback to finalize variable copies.
2735 ///
2736 /// \returns The insertion position *after* the masked.
2737 LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc,
2738 BodyGenCallbackTy BodyGenCB,
2739 FinalizeCallbackTy FiniCB,
2740 Value *Filter);
2741
2742 /// This function performs the scan reduction of the values updated in
2743 /// the input phase. The reduction logic needs to be emitted between input
2744 /// and scan loop returned by `CreateCanonicalScanLoops`. The following
2745 /// is the code that is generated; `buffer` and `span` are expected to be
2746 /// populated before executing the generated code.
2747 /// \code{c}
2748 /// for (int k = 0; k != ceil(log2(span)); ++k) {
2749 /// i = pow(2, k);
2750 /// for (size_t cnt = last_iter; cnt >= i; --cnt)
2751 /// buffer[cnt] op= buffer[cnt-i];
2752 /// }
2753 /// \endcode
2754 /// \param Loc The insert and source location description.
2755 /// \param ReductionInfos Array type containing the ReductionOps.
2756 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2757 /// `ScanInfoInitialize`.
2758 ///
2759 /// \returns The insertion position *after* the scan reduction.
2760 LLVM_ABI InsertPointOrErrorTy emitScanReduction(
2761 const LocationDescription &Loc,
2762 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
2763 ScanInfo *ScanRedInfo);
2764
2765 /// This directive splits and directs the control flow to input phase
2766 /// blocks or scan phase blocks based on (1) whether the input loop or the scan
2767 /// loop is executed, and (2) whether an exclusive or inclusive scan is used.
2768 ///
2769 /// \param Loc The insert and source location description.
2770 /// \param AllocaIP The IP where the temporary buffer for scan reduction
2771 /// needs to be allocated.
2772 /// \param ScanVars Scan Variables.
2773 /// \param IsInclusive Whether it is an inclusive or exclusive scan.
2774 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2775 /// `ScanInfoInitialize`.
2776 ///
2777 /// \returns The insertion position *after* the scan.
2778 LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc,
2779 InsertPointTy AllocaIP,
2780 ArrayRef<llvm::Value *> ScanVars,
2781 ArrayRef<llvm::Type *> ScanVarsType,
2782 bool IsInclusive,
2783 ScanInfo *ScanRedInfo);
2784
2785 /// Generator for '#omp critical'
2786 ///
2787 /// \param Loc The insert and source location description.
2788 /// \param BodyGenCB Callback that will generate the region body code.
2789 /// \param FiniCB Callback to finalize variable copies.
2790 /// \param CriticalName name of the lock used by the critical directive
2791 /// \param HintInst Hint Instruction for hint clause associated with critical
2792 ///
2793 /// \returns The insertion position *after* the critical.
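  ///
  /// Sketch of a call site; the callbacks, the lock name and the absent hint
  /// value are illustrative assumptions supplied by the frontend:
  /// \code{.cpp}
  /// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createCritical(
  ///     Loc, BodyGenCB, FiniCB, /*CriticalName=*/"user_lock",
  ///     /*HintInst=*/nullptr);
  /// if (!AfterIP)
  ///   return AfterIP.takeError();
  /// Builder.restoreIP(*AfterIP);
  /// \endcode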
2794 LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc,
2795 BodyGenCallbackTy BodyGenCB,
2796 FinalizeCallbackTy FiniCB,
2797 StringRef CriticalName,
2798 Value *HintInst);
2799
2800 /// Generator for '#omp ordered depend (source | sink)'
2801 ///
2802 /// \param Loc The insert and source location description.
2803 /// \param AllocaIP The insertion point to be used for alloca instructions.
2804 /// \param NumLoops The number of loops in depend clause.
2805 /// \param StoreValues The values that will be stored in the vector address.
2806 /// \param Name The name of the alloca instruction.
2807 /// \param IsDependSource If true, depend source; otherwise, depend sink.
2808 ///
2809 /// \return The insertion position *after* the ordered.
2810 LLVM_ABI InsertPointTy
2811 createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP,
2812 unsigned NumLoops, ArrayRef<llvm::Value *> StoreValues,
2813 const Twine &Name, bool IsDependSource);
2814
2815 /// Generator for '#omp ordered [threads | simd]'
2816 ///
2817 /// \param Loc The insert and source location description.
2818 /// \param BodyGenCB Callback that will generate the region code.
2819 /// \param FiniCB Callback to finalize variable copies.
2820 /// \param IsThreads If true, the construct has the 'threads' clause or no
2821 /// clause; otherwise, it has the 'simd' clause.
2822 ///
2823 /// \returns The insertion position *after* the ordered.
2824 LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(
2825 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2826 FinalizeCallbackTy FiniCB, bool IsThreads);
2827
2828 /// Generator for '#omp sections'
2829 ///
2830 /// \param Loc The insert and source location description.
2831 /// \param AllocaIP The insertion points to be used for alloca instructions.
2832 /// \param SectionCBs Callbacks that will generate body of each section.
2833 /// \param PrivCB Callback to copy a given variable (think copy constructor).
2834 /// \param FiniCB Callback to finalize variable copies.
2835 /// \param IsCancellable Flag to indicate a cancellable parallel region.
2836 /// \param IsNowait If true, the barrier that ensures all sections have been
2837 /// executed before moving forward is not generated.
2838 /// \returns The insertion position *after* the sections.
2839 LLVM_ABI InsertPointOrErrorTy
2840 createSections(const LocationDescription &Loc, InsertPointTy AllocaIP,
2841 ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
2842 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB,
2843 bool IsCancellable, bool IsNowait);
2844
2845 /// Generator for '#omp section'
2846 ///
2847 /// \param Loc The insert and source location description.
2848 /// \param BodyGenCB Callback that will generate the region body code.
2849 /// \param FiniCB Callback to finalize variable copies.
2850 /// \returns The insertion position *after* the section.
2851 LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc,
2852 BodyGenCallbackTy BodyGenCB,
2853 FinalizeCallbackTy FiniCB);
2854
2855 /// Generator for `#omp teams`
2856 ///
2857 /// \param Loc The location where the teams construct was encountered.
2858 /// \param BodyGenCB Callback that will generate the region code.
2859 /// \param NumTeamsLower Lower bound on the number of teams. If this is nullptr,
2860 /// it is as if the lower bound were specified as equal to the upper bound.
2861 /// If this is non-null, then the upper bound must also be non-null.
2862 /// \param NumTeamsUpper Upper bound on the number of teams.
2863 /// \param ThreadLimit Limit on the number of threads that may participate in
2864 /// a contention group created by each team.
2865 /// \param IfExpr The integer argument value of the 'if' clause condition on
2866 /// the teams construct.
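  ///
  /// Sketch with only an upper bound on the number of teams; all other clause
  /// values are left at their defaults, and the bound of 8 is an arbitrary
  /// illustration:
  /// \code{.cpp}
  /// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTeams(
  ///     Loc, BodyGenCB, /*NumTeamsLower=*/nullptr,
  ///     /*NumTeamsUpper=*/Builder.getInt32(8));
  /// if (!AfterIP)
  ///   return AfterIP.takeError();
  /// Builder.restoreIP(*AfterIP);
  /// \endcode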
2867 LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc,
2868 BodyGenCallbackTy BodyGenCB,
2869 Value *NumTeamsLower = nullptr,
2870 Value *NumTeamsUpper = nullptr,
2871 Value *ThreadLimit = nullptr,
2872 Value *IfExpr = nullptr);
2873
2874 /// Generator for `#omp distribute`
2875 ///
2876 /// \param Loc The location where the distribute construct was encountered.
2877 /// \param AllocaIP The insertion points to be used for alloca instructions.
2878 /// \param BodyGenCB Callback that will generate the region code.
2879 LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc,
2880 InsertPointTy AllocaIP,
2881 BodyGenCallbackTy BodyGenCB);
2882
2883 /// Generate the conditional branch and relevant BasicBlocks through which
2884 /// private threads copy the 'copyin' variables from the master copy to the
2885 /// threadprivate copies.
2886 ///
2887 /// \param IP insertion block for copyin conditional
2888 /// \param MasterAddr A pointer to the master variable.
2889 /// \param PrivateAddr A pointer to the threadprivate variable.
2890 /// \param IntPtrTy Pointer size type.
2891 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
2892 /// and copy.in.end block.
2893 ///
2894 /// \returns The insertion point where the copying operation is to be emitted.
2895 LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP,
2896 Value *MasterAddr,
2897 Value *PrivateAddr,
2898 llvm::IntegerType *IntPtrTy,
2899 bool BranchtoEnd = true);
2900
2901 /// Create a runtime call for kmpc_Alloc
2902 ///
2903 /// \param Loc The insert and source location description.
2904 /// \param Size Size of allocated memory space
2905 /// \param Allocator Allocator information instruction
2906 /// \param Name Name of call Instruction for OMP_alloc
2907 ///
2908 /// \returns CallInst to the OMP_Alloc call
2909 LLVM_ABI CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
2910 Value *Allocator, std::string Name = "");
2911
2912 /// Create a runtime call for kmpc_free
2913 ///
2914 /// \param Loc The insert and source location description.
2915 /// \param Addr Address of memory space to be freed
2916 /// \param Allocator Allocator information instruction
2917 /// \param Name Name of call Instruction for OMP_Free
2918 ///
2919 /// \returns CallInst to the OMP_Free call
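  ///
  /// Sketch pairing this call with createOMPAlloc above; \c Allocator is an
  /// assumption (typically a value naming one of the predefined allocators),
  /// as are the size and instruction names:
  /// \code{.cpp}
  /// CallInst *Ptr = OMPBuilder.createOMPAlloc(Loc, Builder.getInt64(1024),
  ///                                           Allocator, "omp_scratch");
  /// // ... use the allocation ...
  /// OMPBuilder.createOMPFree(Loc, Ptr, Allocator, "omp_scratch_free");
  /// \endcode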
2920 LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
2921 Value *Allocator, std::string Name = "");
2922
2923 /// Create a runtime call for kmpc_threadprivate_cached
2924 ///
2925 /// \param Loc The insert and source location description.
2926 /// \param Pointer pointer to data to be cached
2927 /// \param Size size of data to be cached
2928 /// \param Name Name of call Instruction for callinst
2929 ///
2930 /// \returns CallInst to the thread private cache call.
2931 LLVM_ABI CallInst *
2932 createCachedThreadPrivate(const LocationDescription &Loc,
2933 llvm::Value *Pointer, llvm::ConstantInt *Size,
2934 const llvm::Twine &Name = Twine(""));
2935
2936 /// Create a runtime call for __tgt_interop_init
2937 ///
2938 /// \param Loc The insert and source location description.
2939 /// \param InteropVar variable to be allocated
2940 /// \param InteropType type of interop operation
2941 /// \param Device device to which offloading will occur
2942 /// \param NumDependences number of dependence variables
2943 /// \param DependenceAddress pointer to dependence variables
2944 /// \param HaveNowaitClause does nowait clause exist
2945 ///
2946 /// \returns CallInst to the __tgt_interop_init call
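  ///
  /// Sketch of a call site; \c InteropVar and \c InteropTy (an
  /// omp::OMPInteropType value) are assumed to come from clause analysis, and
  /// the device and dependence operands are illustrative:
  /// \code{.cpp}
  /// OMPBuilder.createOMPInteropInit(Loc, InteropVar, InteropTy,
  ///                                 /*Device=*/Builder.getInt32(0),
  ///                                 /*NumDependences=*/Builder.getInt32(0),
  ///                                 /*DependenceAddress=*/nullptr,
  ///                                 /*HaveNowaitClause=*/false);
  /// \endcode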
2947 LLVM_ABI CallInst *createOMPInteropInit(const LocationDescription &Loc,
2948 Value *InteropVar,
2949 omp::OMPInteropType InteropType,
2950 Value *Device, Value *NumDependences,
2951 Value *DependenceAddress,
2952 bool HaveNowaitClause);
2953
2954 /// Create a runtime call for __tgt_interop_destroy
2955 ///
2956 /// \param Loc The insert and source location description.
2957 /// \param InteropVar variable to be allocated
2958 /// \param Device device to which offloading will occur
2959 /// \param NumDependences number of dependence variables
2960 /// \param DependenceAddress pointer to dependence variables
2961 /// \param HaveNowaitClause does nowait clause exist
2962 ///
2963 /// \returns CallInst to the __tgt_interop_destroy call
2964 LLVM_ABI CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
2965 Value *InteropVar, Value *Device,
2966 Value *NumDependences,
2967 Value *DependenceAddress,
2968 bool HaveNowaitClause);
2969
2970 /// Create a runtime call for __tgt_interop_use
2971 ///
2972 /// \param Loc The insert and source location description.
2973 /// \param InteropVar variable to be allocated
2974 /// \param Device device to which offloading will occur
2975 /// \param NumDependences number of dependence variables
2976 /// \param DependenceAddress pointer to dependence variables
2977 /// \param HaveNowaitClause does nowait clause exist
2978 ///
2979 /// \returns CallInst to the __tgt_interop_use call
2980 LLVM_ABI CallInst *createOMPInteropUse(const LocationDescription &Loc,
2981 Value *InteropVar, Value *Device,
2982 Value *NumDependences,
2983 Value *DependenceAddress,
2984 bool HaveNowaitClause);
2985
2986 /// The `omp target` interface
2987 ///
2988 /// For more information about the usage of this interface,
2989 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
2990 ///
2991 ///{
2992
2993 /// Create a runtime call for kmpc_target_init
2994 ///
2995 /// \param Loc The insert and source location description.
2996 /// \param Attrs Structure containing the default attributes, including
2997 /// numbers of threads and teams to launch the kernel with.
2998 LLVM_ABI InsertPointTy createTargetInit(
2999 const LocationDescription &Loc,
3000 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs);
3001
3002 /// Create a runtime call for kmpc_target_deinit
3003 ///
3004 /// \param Loc The insert and source location description.
3005 /// \param TeamsReductionDataSize The maximal size of all the reduction data
3006 /// for teams reduction.
3007 /// \param TeamsReductionBufferLength The number of elements (each of up to
3008 /// \p TeamsReductionDataSize size), in the teams reduction buffer.
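  ///
  /// Device-side prologue/epilogue sketch; the default-constructed attribute
  /// struct is only a placeholder, as real values come from clause analysis:
  /// \code{.cpp}
  /// OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
  /// Builder.restoreIP(OMPBuilder.createTargetInit(Loc, Attrs));
  /// // ... emit the kernel body ...
  /// OMPBuilder.createTargetDeinit(Loc);
  /// \endcode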
3009 LLVM_ABI void createTargetDeinit(const LocationDescription &Loc,
3010 int32_t TeamsReductionDataSize = 0,
3011 int32_t TeamsReductionBufferLength = 1024);
3012
3013 ///}
3014
3015 /// Helpers to read/write kernel annotations from the IR.
3016 ///
3017 ///{
3018
3019 /// Read/write the bounds on the number of threads for \p Kernel. Read will
3020 /// return 0 if none is set.
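  ///
  /// For example (\c T is the target triple and \c KernelFn the kernel
  /// function; both, and the bounds written, are assumptions of this sketch):
  /// \code{.cpp}
  /// std::pair<int32_t, int32_t> Bounds =
  ///     OpenMPIRBuilder::readThreadBoundsForKernel(T, *KernelFn);
  /// // Bounds.first/second hold the current LB/UB (0 if unset).
  /// OpenMPIRBuilder::writeThreadBoundsForKernel(T, *KernelFn, /*LB=*/1,
  ///                                             /*UB=*/256);
  /// \endcode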
3021 LLVM_ABI static std::pair<int32_t, int32_t>
3022 readThreadBoundsForKernel(const Triple &T, Function &Kernel);
3023 LLVM_ABI static void writeThreadBoundsForKernel(const Triple &T,
3024 Function &Kernel, int32_t LB,
3025 int32_t UB);
3026
3027 /// Read/write the bounds on the number of teams for \p Kernel. Read will
3028 /// return 0 if none is set.
3029 LLVM_ABI static std::pair<int32_t, int32_t>
3030 readTeamBoundsForKernel(const Triple &T, Function &Kernel);
3031 LLVM_ABI static void writeTeamsForKernel(const Triple &T, Function &Kernel,
3032 int32_t LB, int32_t UB);
3033 ///}
3034
3035private:
3036 // Sets the function attributes expected for the outlined function
3037 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
3038
3039 // Creates the function ID/Address for the given outlined function.
3040 // In the case of an embedded device function the address of the function is
3041 // used, in the case of a non-offload function a constant is created.
3042 Constant *createOutlinedFunctionID(Function *OutlinedFn,
3043 StringRef EntryFnIDName);
3044
3045 // Creates the region entry address for the outlined function
3046 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
3047 StringRef EntryFnName);
3048
3049public:
3050 /// Functions used to generate a function with the given name.
3051 using FunctionGenCallback =
3052 std::function<Expected<Function *>(StringRef FunctionName)>;
3053
3054 /// Create a unique name for the entry function using the source location
3055 /// information of the current target region. The name will be something like:
3056 ///
3057 /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
3058 ///
3059 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
3060 /// mangled name of the function that encloses the target region and BB is the
3061 /// line number of the target region. CC is a count added when more than one
3062 /// region is located at the same location.
3063 ///
3064 /// If this target outline function is not an offload entry, we don't need to
3065 /// register it. This may happen if it is guarded by an if clause that is
3066 /// false at compile time, or no target archs have been specified.
3067 ///
3068 /// The created target region ID is used by the runtime library to identify
3069 /// the current target region, so it only has to be unique and not
3070 /// necessarily point to anything. It could be the pointer to the outlined
3071 /// function that implements the target region, but we aren't using that so
3072 /// that the compiler doesn't need to keep that, and could therefore inline
3073 /// the host function if proven worthwhile during optimization. On the other
3074 /// hand, if emitting code for the device, the ID has to be the function
3075 /// address so that it can be retrieved from the offloading entry and launched
3076 /// by the runtime library. We also mark the outlined function to have
3077 /// external linkage in case we are emitting code for the device, because
3078 /// these functions will be entry points to the device.
3079 ///
3080 /// \param EntryInfo The entry information about the function
3081 /// \param GenerateFunctionCallback The callback function to generate the code
3082 /// \param IsOffloadEntry Whether the outlined function is an offload entry
3083 /// \param OutlinedFn Pointer to the outlined function
3084 /// \param OutlinedFnID The ID of the outlined function to be created
3085 LLVM_ABI Error emitTargetRegionFunction(
3086 TargetRegionEntryInfo &EntryInfo,
3087 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
3088 Function *&OutlinedFn, Constant *&OutlinedFnID);
3089
3090 /// Registers the given function and sets up the attributes of the function.
3091 /// Returns the FunctionID.
3092 ///
3093 /// \param EntryInfo The entry information about the function
3094 /// \param OutlinedFunction Pointer to the outlined function
3095 /// \param EntryFnName Name of the outlined function
3096 /// \param EntryFnIDName Name of the ID to be created
3098 LLVM_ABI Constant *
3099 registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
3100 Function *OutlinedFunction,
3101 StringRef EntryFnName, StringRef EntryFnIDName);
3102
3103 /// Type of BodyGen to use for region codegen
3104 ///
3105 /// Priv: If device pointer privatization is required, emit the body of the
3106 /// region here. It will have to be duplicated: with and without
3107 /// privatization.
3108 /// DupNoPriv: If we need device pointer privatization, we need
3109 /// to emit the body of the region with no privatization in the 'else' branch
3110 /// of the conditional.
3111 /// NoPriv: If we don't require privatization of device
3112 /// pointers, we emit the body in between the runtime calls. This avoids
3113 /// duplicating the body code.
3114 enum BodyGenTy { Priv, DupNoPriv, NoPriv };
3115
3116 /// Callback type for creating the map infos for the kernel parameters.
3117 /// \param CodeGenIP is the insertion point where code should be generated,
3118 /// if any.
3119 using GenMapInfoCallbackTy =
3120 function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;
3121
3122private:
3123 /// Emit the array initialization or deletion portion for user-defined mapper
3124 /// code generation. First, it evaluates whether an array section is mapped
3125 /// and whether the \a MapType instructs to delete this section. If \a IsInit
3126 /// is true, and \a MapType indicates to not delete this array, array
3127 /// initialization code is generated. If \a IsInit is false, and \a MapType
3128 /// indicates to delete this array, array deletion code is generated.
3129 void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
3130 llvm::Value *Base, llvm::Value *Begin,
3131 llvm::Value *Size, llvm::Value *MapType,
3132 llvm::Value *MapName, TypeSize ElementSize,
3133 llvm::BasicBlock *ExitBB, bool IsInit);
3134
3135public:
3136 /// Emit the user-defined mapper function. The code generation follows the
3137 /// pattern in the example below.
3138 /// \code
3139 /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
3140 /// void *base, void *begin,
3141 /// int64_t size, int64_t type,
3142 /// void *name = nullptr) {
3143 /// // Allocate space for an array section first or add a base/begin for
3144 /// // pointer dereference.
3145 /// if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
3146 /// !maptype.IsDelete)
3147 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3148 /// size*sizeof(Ty), clearToFromMember(type));
3149 /// // Map members.
3150 /// for (unsigned i = 0; i < size; i++) {
3151 /// // For each component specified by this mapper:
3152 /// for (auto c : begin[i]->all_components) {
3153 /// if (c.hasMapper())
3154 /// (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
3155 /// c.arg_size,
3156 /// c.arg_type, c.arg_name);
3157 /// else
3158 /// __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
3159 /// c.arg_begin, c.arg_size, c.arg_type,
3160 /// c.arg_name);
3161 /// }
3162 /// }
3163 /// // Delete the array section.
3164 /// if (size > 1 && maptype.IsDelete)
3165 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3166 /// size*sizeof(Ty), clearToFromMember(type));
3167 /// }
3168 /// \endcode
3169 ///
3170 /// \param PrivAndGenMapInfoCB Callback that performs privatization and
3171 /// populates and returns the MapInfos.
3172 /// \param ElemTy DeclareMapper element type.
3173 /// \param FuncName Optional param to specify mapper function name.
3174 /// \param CustomMapperCB Optional callback to generate code related to
3175 /// custom mappers.
3176 LLVM_ABI Expected<Function *> emitUserDefinedMapper(
3177 function_ref<MapInfosOrErrorTy(
3178 InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)>
3179 PrivAndGenMapInfoCB,
3180 llvm::Type *ElemTy, StringRef FuncName,
3181 CustomMapperCallbackTy CustomMapperCB);
3182
3183 /// Generator for '#omp target data'
3184 ///
3185 /// \param Loc The location where the target data construct was encountered.
3186 /// \param AllocaIP The insertion points to be used for alloca instructions.
3187 /// \param CodeGenIP The insertion point at which the target directive code
3188 /// should be placed.
3189 /// \param IsBegin If true, emits the begin mapper call; otherwise emits the
3190 /// end mapper call.
3191 /// \param DeviceID Stores the DeviceID from the device clause.
3192 /// \param IfCond Value which corresponds to the if clause condition.
3193 /// \param Info Stores all information related to the Target Data directive.
3194 /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
3195 /// \param CustomMapperCB Callback to generate code related to
3196 /// custom mappers.
3197 /// \param BodyGenCB Optional Callback to generate the region code.
3198 /// \param DeviceAddrCB Optional callback to generate code related to
3199 /// use_device_ptr and use_device_addr.
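  ///
  /// A hedged sketch of a begin/end mapper emission for a simple map clause;
  /// the map-info population is elided, and \c Loc, \c AllocaIP, \c CodeGenIP
  /// and \c DeviceID are assumptions set up by the caller:
  /// \code{.cpp}
  /// OpenMPIRBuilder::MapInfosTy CombinedInfo; // kept alive by the caller
  /// auto GenMapInfoCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP)
  ///     -> OpenMPIRBuilder::MapInfosTy & {
  ///   // Fill CombinedInfo.BasePointers/Pointers/Sizes/Types here.
  ///   return CombinedInfo;
  /// };
  /// auto CustomMapperCB = [](unsigned) -> Expected<Function *> {
  ///   return nullptr; // no user-defined mappers in this sketch
  /// };
  /// OpenMPIRBuilder::TargetDataInfo Info(/*RequiresDevicePointerInfo=*/false,
  ///                                      /*SeparateBeginEndCalls=*/true);
  /// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTargetData(
  ///     Loc, AllocaIP, CodeGenIP, DeviceID, /*IfCond=*/nullptr, Info,
  ///     GenMapInfoCB, CustomMapperCB);
  /// if (!AfterIP)
  ///   return AfterIP.takeError();
  /// \endcode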
3200 LLVM_ABI InsertPointOrErrorTy createTargetData(
3201 const LocationDescription &Loc, InsertPointTy AllocaIP,
3202 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
3203 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
3204 CustomMapperCallbackTy CustomMapperCB,
3205 omp::RuntimeFunction *MapperFunc = nullptr,
3206 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
3207 BodyGenTy BodyGenType)>
3208 BodyGenCB = nullptr,
3209 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
3210 Value *SrcLocInfo = nullptr);
3211
3212 using TargetBodyGenCallbackTy = function_ref<InsertPointOrErrorTy(
3213 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
3214
3215 using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
3216 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
3217 InsertPointTy CodeGenIP)>;
3218
3219 /// Generator for '#omp target'
3220 ///
3221 /// \param Loc The location where the target construct was encountered.
3222 /// \param IsOffloadEntry whether it is an offload entry.
3223 /// \param CodeGenIP The insertion point where the call to the outlined
3224 /// function should be emitted.
3225 /// \param Info Stores all information related to the Target directive.
3226 /// \param EntryInfo The entry information about the function.
3227 /// \param DefaultAttrs Structure containing the default attributes, including
3228 /// numbers of threads and teams to launch the kernel with.
3229 /// \param RuntimeAttrs Structure containing the runtime numbers of threads
3230 /// and teams to launch the kernel with.
3231 /// \param IfCond value of the `if` clause.
3232 /// \param Inputs The input values to the region that will be passed
3233 /// as arguments to the outlined function.
3234 /// \param BodyGenCB Callback that will generate the region code.
3235 /// \param ArgAccessorFuncCB Callback that will generate accessor
3236 /// instructions for passed-in target arguments where necessary.
3237 /// \param CustomMapperCB Callback to generate code related to
3238 /// custom mappers.
3239 /// \param Dependencies A vector of DependData objects that carry
3240 /// dependency information as passed in the depend clause
3241 /// \param HasNowait Whether the target construct has a `nowait` clause or
3242 /// not.
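  ///
  /// A condensed sketch of the argument order; every callback and value here
  /// is assumed to have been prepared by the frontend as described above:
  /// \code{.cpp}
  /// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTarget(
  ///     Loc, /*IsOffloadEntry=*/true, AllocaIP, CodeGenIP, Info, EntryInfo,
  ///     DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, Inputs, GenMapInfoCB,
  ///     BodyGenCB, ArgAccessorFuncCB, CustomMapperCB, /*Dependencies=*/{},
  ///     /*HasNowait=*/false);
  /// if (!AfterIP)
  ///   return AfterIP.takeError();
  /// \endcode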
3243 LLVM_ABI InsertPointOrErrorTy createTarget(
3244 const LocationDescription &Loc, bool IsOffloadEntry,
3245 OpenMPIRBuilder::InsertPointTy AllocaIP,
3246 OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info,
3247 TargetRegionEntryInfo &EntryInfo,
3248 const TargetKernelDefaultAttrs &DefaultAttrs,
3249 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
3250 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
3251 TargetBodyGenCallbackTy BodyGenCB,
3252 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
3253 CustomMapperCallbackTy CustomMapperCB,
3254 const SmallVector<DependData> &Dependencies, bool HasNowait = false);
3255
3256 /// Returns __kmpc_for_static_init_* runtime function for the specified
3257 /// size \a IVSize and sign \a IVSigned. Will create a distribute call
3258 /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
3259 LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize,
3260 bool IVSigned,
3261 bool IsGPUDistribute);
3262
3263 /// Returns __kmpc_dispatch_init_* runtime function for the specified
3264 /// size \a IVSize and sign \a IVSigned.
3265 LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize,
3266 bool IVSigned);
3267
3268 /// Returns __kmpc_dispatch_next_* runtime function for the specified
3269 /// size \a IVSize and sign \a IVSigned.
3270 LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize,
3271 bool IVSigned);
3272
3273 /// Returns __kmpc_dispatch_fini_* runtime function for the specified
3274 /// size \a IVSize and sign \a IVSigned.
3275 LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize,
3276 bool IVSigned);
3277
3278 /// Returns __kmpc_dispatch_deinit runtime function.
3279 LLVM_ABI FunctionCallee createDispatchDeinitFunction();
3280
3281 /// Declarations for LLVM-IR types (simple, array, function and structure) are
3282 /// generated below. Their names are defined and used in OpenMPKinds.def. Here
3283 /// we provide the declarations, the initializeTypes function will provide the
3284 /// values.
3285 ///
3286 ///{
3287#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
3288#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
3289 ArrayType *VarName##Ty = nullptr; \
3290 PointerType *VarName##PtrTy = nullptr;
3291#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
3292 FunctionType *VarName = nullptr; \
3293 PointerType *VarName##Ptr = nullptr;
3294#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
3295 StructType *VarName = nullptr; \
3296 PointerType *VarName##Ptr = nullptr;
3297#include "llvm/Frontend/OpenMP/OMPKinds.def"
3298
3299 ///}
3300
3301private:
3302 /// Create all simple and struct types exposed by the runtime and remember
3303 /// the llvm::PointerTypes of them for easy access later.
3304 void initializeTypes(Module &M);
3305
3306 /// Common interface for generating entry calls for OMP Directives.
3307 /// If the directive has a region/body, it will set the insertion
3308 /// point to the body.
3309 ///
3310 /// \param OMPD Directive to generate entry blocks for
3311 /// \param EntryCall Call to the entry OMP Runtime Function
3312 /// \param ExitBB block where the region ends.
3313 /// \param Conditional indicate if the entry call result will be used
3314 /// to evaluate a conditional of whether a thread will execute
3315 /// body code or not.
3316 ///
3317 /// \return The insertion position in exit block
3318 InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
3319 BasicBlock *ExitBB,
3320 bool Conditional = false);
3321
3322 /// Common interface to finalize the region
3323 ///
3324 /// \param OMPD Directive to generate exiting code for
3325 /// \param FinIP Insertion point for emitting Finalization code and exit call
3326 /// \param ExitCall Call to the ending OMP Runtime Function
3327 /// \param HasFinalize indicate if the directive will require finalization
3328 /// and has a finalization callback in the stack that
3329 /// should be called.
3330 ///
3331 /// \return The insertion position in exit block
3332 InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
3333 InsertPointTy FinIP,
3334 Instruction *ExitCall,
3335 bool HasFinalize = true);
3336
3337 /// Common Interface to generate OMP inlined regions
3338 ///
3339 /// \param OMPD Directive to generate inlined region for
3340 /// \param EntryCall Call to the entry OMP Runtime Function
3341 /// \param ExitCall Call to the ending OMP Runtime Function
3342 /// \param BodyGenCB Body code generation callback.
3343 /// \param FiniCB Finalization Callback. Will be called when finalizing region
3344 /// \param Conditional indicate if the entry call result will be used
3345 /// to evaluate a conditional of whether a thread will execute
3346 /// body code or not.
3347 /// \param HasFinalize indicate if the directive will require finalization
3348 /// and has a finalization callback in the stack that
3349 /// should be called.
3350 /// \param IsCancellable if HasFinalize is set to true, indicate if the
3351 /// directive should be cancellable.
3352 /// \return The insertion point after the region
3353 InsertPointOrErrorTy
3354 EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
3355 Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
3356 FinalizeCallbackTy FiniCB, bool Conditional = false,
3357 bool HasFinalize = true, bool IsCancellable = false);
3358
3359 /// Get the platform-specific name separator.
3360 /// \param Parts Different parts of the final name that need separation
3361 /// \param FirstSeparator First separator used between the initial two
3362 /// parts of the name.
3363 /// \param Separator Separator used between all of the remaining consecutive
3364 /// parts of the name
3365 static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
3366 StringRef FirstSeparator,
3367 StringRef Separator);
3368
3369 /// Returns corresponding lock object for the specified critical region
3370 /// name. If the lock object does not exist it is created, otherwise the
3371 /// reference to the existing copy is returned.
3372 /// \param CriticalName Name of the critical region.
3373 ///
3374 Value *getOMPCriticalRegionLock(StringRef CriticalName);
3375
3376 /// Callback type for Atomic Expression update
3377 /// ex:
3378 /// \code{.cpp}
3379 /// unsigned x = 0;
3380 /// #pragma omp atomic update
3381 /// x = Expr(x_old); //Expr() is any legal operation
3382 /// \endcode
3383 ///
3384 /// \param XOld the value of the atomic memory address to use for update
3385 /// \param IRB reference to the IRBuilder to use
3386 ///
3387 /// \returns Value to update X to.
3388 using AtomicUpdateCallbackTy =
3389 const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>;
3390
3391private:
3392 enum AtomicKind { Read, Write, Update, Capture, Compare };
3393
3394 /// Determine whether to emit flush or not
3395 ///
3396 /// \param Loc The insert and source location description.
3397 /// \param AO The required atomic ordering
3398 /// \param AK The OpenMP atomic operation kind used.
3399 ///
3400 /// \returns whether a flush was emitted or not
3401 bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
3402 AtomicOrdering AO, AtomicKind AK);
3403
3404 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3405 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3406 /// Only Scalar data types.
3407 ///
3408 /// \param AllocaIP The insertion point to be used for alloca
3409 /// instructions.
3410 /// \param X The target atomic pointer to be updated
3411 /// \param XElemTy The element type of the atomic pointer.
3412 /// \param Expr The value to update X with.
3413 /// \param AO Atomic ordering of the generated atomic
3414 /// instructions.
3415 /// \param RMWOp The binary operation used for the update. If the
3416 /// operation is not supported by atomicRMW,
3417 /// or belongs to {FADD, FSUB, BAD_BINOP},
3418 /// then a `cmpExch`-based atomic will be generated.
3419 /// \param UpdateOp Code generator for complex expressions that cannot be
3420 /// expressed through atomicrmw instruction.
3421 /// \param VolatileX True if \a X is volatile.
3422 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3423 /// update expression, false otherwise.
3424 /// (e.g. true for X = X BinOp Expr)
3425 ///
3426 /// \returns A pair of the old value of X before the update, and the value
3427 /// used for the update.
3428 Expected<std::pair<Value *, Value *>>
3429 emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3430 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3431 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
3432 bool IsXBinopExpr, bool IsIgnoreDenormalMode,
3433 bool IsFineGrainedMemory, bool IsRemoteMemory);
3434
3435 /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
3436 ///
3437 /// \Return The instruction
3438 Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3439 AtomicRMWInst::BinOp RMWOp);
3440
3441 bool IsFinalized;
3442
3443public:
3444 /// a struct to pack relevant information while generating atomic Ops
3445 struct AtomicOpValue {
3446 Value *Var = nullptr;
3447 Type *ElemTy = nullptr;
3448 bool IsSigned = false;
3449 bool IsVolatile = false;
3450 };
3451
3452 /// Emit atomic Read for : V = X --- Only Scalar data types.
3453 ///
3454 /// \param Loc The insert and source location description.
3455 /// \param X The target pointer to be atomically read
3456 /// \param V Memory address where to store atomically read
3457 /// value
3458 /// \param AO Atomic ordering of the generated atomic
3459 /// instructions.
3460 /// \param AllocaIP Insert point for allocas
3461 ///
3462 /// \return Insertion point after generated atomic read IR.
3463 LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc,
3464 AtomicOpValue &X, AtomicOpValue &V,
3465 AtomicOrdering AO,
3466 InsertPointTy AllocaIP);
3467
3468 /// Emit atomic write for : X = Expr --- Only Scalar data types.
3469 ///
3470 /// \param Loc The insert and source location description.
3471 /// \param X The target pointer to be atomically written to
3472 /// \param Expr The value to store.
3473 /// \param AO Atomic ordering of the generated atomic
3474 /// instructions.
3475 /// \param AllocaIP Insert point for allocas
3476 ///
3477 /// \return Insertion point after generated atomic Write IR.
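  ///
  /// Sketch emitting `v = x` followed by `x = 42` for an i32 location; \c XPtr
  /// and \c VPtr are assumed pointers produced elsewhere, and the ordering is
  /// an arbitrary choice for illustration:
  /// \code{.cpp}
  /// OpenMPIRBuilder::AtomicOpValue X{XPtr, Builder.getInt32Ty(),
  ///                                  /*IsSigned=*/true, /*IsVolatile=*/false};
  /// OpenMPIRBuilder::AtomicOpValue V{VPtr, Builder.getInt32Ty(),
  ///                                  /*IsSigned=*/true, /*IsVolatile=*/false};
  /// Builder.restoreIP(OMPBuilder.createAtomicRead(
  ///     Loc, X, V, AtomicOrdering::Monotonic, AllocaIP));
  /// Builder.restoreIP(OMPBuilder.createAtomicWrite(
  ///     Loc, X, Builder.getInt32(42), AtomicOrdering::Monotonic, AllocaIP));
  /// \endcode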
3478 LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc,
3479 AtomicOpValue &X, Value *Expr,
3480 AtomicOrdering AO,
3481 InsertPointTy AllocaIP);
3482
3483 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3484 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3485 /// Only Scalar data types.
3486 ///
3487 /// \param Loc The insert and source location description.
3488 /// \param AllocaIP The insertion point to be used for alloca instructions.
3489 /// \param X The target atomic pointer to be updated
3490 /// \param Expr The value to update X with.
3491 /// \param AO Atomic ordering of the generated atomic instructions.
3492 /// \param RMWOp The binary operation used for the update. If the operation
3493 /// is not supported by atomicRMW, or belongs to
3494 /// {FADD, FSUB, BAD_BINOP}, then a `cmpExch`-based
3495 /// atomic will be generated.
3496 /// \param UpdateOp Code generator for complex expressions that cannot be
3497 /// expressed through atomicrmw instruction.
3498 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3499 /// update expression, false otherwise.
3500 /// (e.g. true for X = X BinOp Expr)
3501 ///
3502 /// \return Insertion point after generated atomic update IR.
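  ///
  /// Sketch modelling `#pragma omp atomic update` on `x += 1` for an i32
  /// location; \c XPtr, \c Loc and \c AllocaIP are assumptions set up by the
  /// caller:
  /// \code{.cpp}
  /// OpenMPIRBuilder::AtomicOpValue X{XPtr, Builder.getInt32Ty(),
  ///                                  /*IsSigned=*/true, /*IsVolatile=*/false};
  /// auto UpdateOp = [](Value *XOld, IRBuilder<> &IRB) -> Expected<Value *> {
  ///   return IRB.CreateAdd(XOld, IRB.getInt32(1)); // models x = x + 1
  /// };
  /// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createAtomicUpdate(
  ///     Loc, AllocaIP, X, /*Expr=*/Builder.getInt32(1),
  ///     AtomicOrdering::Monotonic, AtomicRMWInst::Add, UpdateOp,
  ///     /*IsXBinopExpr=*/true);
  /// if (!AfterIP)
  ///   return AfterIP.takeError();
  /// \endcode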
3503 LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(
3504 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
3505 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3506 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
3507 bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
3508 bool IsRemoteMemory = false);
3509
3510 /// Emit atomic update for constructs: --- Only Scalar data types
3511 /// V = X; X = X BinOp Expr ,
3512 /// X = X BinOp Expr; V = X,
3513 /// V = X; X = Expr BinOp X,
3514 /// X = Expr BinOp X; V = X,
3515 /// V = X; X = UpdateOp(X),
3516 /// X = UpdateOp(X); V = X,
3517 ///
3518 /// \param Loc The insert and source location description.
3519 /// \param AllocaIP The insertion point to be used for alloca instructions.
3520 /// \param X The target atomic pointer to be updated
3521 /// \param V Memory address where to store captured value
3522 /// \param Expr The value to update X with.
3523 /// \param AO Atomic ordering of the generated atomic instructions
3524 /// \param RMWOp The binary operation used for the update. If the
3525 /// operation is not supported by atomicRMW, or belongs to
3526 /// {FADD, FSUB, BAD_BINOP}, then a cmpExch-based
3527 /// atomic will be generated.
3528 /// \param UpdateOp Code generator for complex expressions that cannot be
3529 /// expressed through atomicrmw instruction.
3530 /// \param UpdateExpr true if X is an in place update of the form
3531 /// X = X BinOp Expr or X = Expr BinOp X
3532 /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
3533 /// update expression, false otherwise.
3534 /// (e.g. true for X = X BinOp Expr)
3535 /// \param IsPostfixUpdate true if original value of 'x' must be stored in
3536 /// 'v', not an updated one.
3537 ///
3538 /// \return Insertion point after generated atomic capture IR.
3539 LLVM_ABI InsertPointOrErrorTy createAtomicCapture(
3540 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
3541 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
3542 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
3543 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
3544 bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
3545 bool IsRemoteMemory = false);
3546
3547 /// Emit atomic compare for constructs: --- Only scalar data types
3548 /// cond-expr-stmt:
3549 /// x = x ordop expr ? expr : x;
3550 /// x = expr ordop x ? expr : x;
3551 /// x = x == e ? d : x;
3552 /// x = e == x ? d : x; (this one is not in the spec)
3553 /// cond-update-stmt:
3554 /// if (x ordop expr) { x = expr; }
3555 /// if (expr ordop x) { x = expr; }
3556 /// if (x == e) { x = d; }
3557 /// if (e == x) { x = d; } (this one is not in the spec)
3558 /// conditional-update-capture-atomic:
3559 /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
3560 /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
3561 /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3562 /// IsFailOnly=true)
3563 /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
3564 /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3565 /// IsFailOnly=true)
3566 ///
3567 /// \param Loc The insert and source location description.
3568 /// \param X The target atomic pointer to be updated.
3569 /// \param V Memory address where to store captured value (for
3570 /// compare capture only).
3571 /// \param R Memory address where to store comparison result
3572 /// (for compare capture with '==' only).
3573 /// \param E The expected value ('e') for forms that use an
3574 /// equality comparison or an expression ('expr') for
3575 /// forms that use 'ordop' (logically an atomic maximum or
3576 /// minimum).
3577 /// \param D The desired value for forms that use an equality
3578 /// comparison. For forms that use 'ordop', it should be
3579 /// \p nullptr.
3580 /// \param AO Atomic ordering of the generated atomic instructions.
3581 /// \param Op Atomic compare operation. It can only be ==, <, or >.
3582 /// \param IsXBinopExpr True if the conditional statement is in the form where
3583 /// x is on LHS. It only matters for < or >.
3584 /// \param IsPostfixUpdate True if original value of 'x' must be stored in
3585 /// 'v', not an updated one (for compare capture
3586 /// only).
3587 /// \param IsFailOnly True if the original value of 'x' is stored to 'v'
3588 /// only when the comparison fails. This is only valid for
3589 /// the case the comparison is '=='.
3590 ///
3591 /// \return Insertion point after generated atomic capture IR.
3592 LLVM_ABI InsertPointTy
3593 createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
3594 AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
3595 AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
3596 bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
3597 LLVM_ABI InsertPointTy createAtomicCompare(
3598 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
3599 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
3600 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
3601 bool IsFailOnly, AtomicOrdering Failure);
3602
3603 /// Create the control flow structure of a canonical OpenMP loop.
3604 ///
3605 /// The emitted loop will be disconnected, i.e. no edge to the loop's
3606 /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
3607 /// IRBuilder location is not preserved.
3608 ///
3609 /// \param DL DebugLoc used for the instructions in the skeleton.
3610 /// \param TripCount Value to be used for the trip count.
3611 /// \param F Function in which to insert the BasicBlocks.
3612 /// \param PreInsertBefore Where to insert BBs that execute before the body,
3613 /// typically the body itself.
3614 /// \param PostInsertBefore Where to insert BBs that execute after the body.
3615 /// \param Name Base name used to derive BB
3616 /// and instruction names.
3617 ///
3618 /// \returns The CanonicalLoopInfo that represents the emitted loop.
3619 LLVM_ABI CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
3620 Function *F,
3621 BasicBlock *PreInsertBefore,
3622 BasicBlock *PostInsertBefore,
3623 const Twine &Name = {});
3624 /// OMP Offload Info Metadata name string
3625 const std::string ompOffloadInfoName = "omp_offload.info";
3626
3627 /// Loads all the offload entries information from the host IR
3628 /// metadata. This function is only meant to be used with device code
3629 /// generation.
3630 ///
3631 /// \param M Module to load Metadata info from. The module passed may be
3632 /// loaded from a bitcode file, i.e., different from the OpenMPIRBuilder::M module.
3633 LLVM_ABI void loadOffloadInfoMetadata(Module &M);
3634
3635 /// Loads all the offload entries information from the host IR
3636 /// metadata read from the file passed in as the HostFilePath argument. This
3637 /// function is only meant to be used with device code generation.
3638 ///
3639 /// \param HostFilePath The path to the host IR file,
3640 /// used to load in offload metadata for the device, allowing host and device
3641 /// to maintain the same metadata mapping.
3642 LLVM_ABI void loadOffloadInfoMetadata(vfs::FileSystem &VFS,
3643 StringRef HostFilePath);
3644
3645 /// Gets (if a variable with the given name already exists) or creates an
3646 /// internal global variable with the specified Name. The created variable has
3647 /// CommonLinkage by default and is initialized to a null value.
3648 /// \param Ty Type of the global variable. If it already exists, the type
3649 /// must be the same.
3650 /// \param Name Name of the variable.
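  ///
  /// For example (the variable name shown is a hypothetical one):
  /// \code{.cpp}
  /// GlobalVariable *GV = OMPBuilder.getOrCreateInternalVariable(
  ///     Builder.getInt32Ty(), ".omp.example.cache");
  /// \endcode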
3651 LLVM_ABI GlobalVariable *
3652 getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
3653 unsigned AddressSpace = 0);
3654};
3655
3656/// Class to represent the control flow structure of an OpenMP canonical loop.
3657///
3658/// The control-flow structure is standardized for easy consumption by
3659/// directives associated with loops. For instance, the worksharing-loop
3660/// construct may change this control flow such that each loop iteration is
3661/// executed on only one thread. The constraints of a canonical loop in brief
3662/// are:
3663///
3664/// * The number of loop iterations must have been computed before entering the
3665/// loop.
3666///
3667/// * Has an (unsigned) logical induction variable that starts at zero and
3668/// increments by one.
3669///
3670/// * The loop's CFG itself has no side-effects. The OpenMP specification
3671/// itself allows side-effects, but the order in which they happen, including
3672/// how often or whether at all, is unspecified. We expect that the frontend
3673/// will emit those side-effect instructions somewhere (e.g. before the loop)
3674/// such that the CanonicalLoopInfo itself can be side-effect free.
3675///
3676/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
3677/// execution of a loop body that satisfies these constraints. It does NOT
3678/// represent arbitrary SESE regions that happen to contain a loop. Do not use
3679/// CanonicalLoopInfo for such purposes.
3680///
3681/// The control flow can be described as follows:
3682///
3683/// Preheader
3684/// |
3685/// /-> Header
3686/// | |
3687/// | Cond---\
3688/// | | |
3689/// | Body |
3690/// | | | |
3691/// | <...> |
3692/// | | | |
3693/// \--Latch |
3694/// |
3695/// Exit
3696/// |
3697/// After
3698///
3699/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3700/// inclusive) and end at AfterIP (at After's first instruction, exclusive).
3701/// That is, instructions in the Preheader and After blocks (except the
3702/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3703/// side-effects. Typically, the Preheader is used to compute the loop's trip
3704/// count. The instructions from BodyIP (at the Body block's first instruction,
3705/// exclusive) until the Latch are also considered outside CanonicalLoopInfo's
3706/// control and thus can have side-effects. The body block is the single entry
3707/// point into the loop body, which may contain arbitrary control flow as long
3708/// as all control paths eventually branch to the Latch block.
3709///
3710/// TODO: Consider adding another standardized BasicBlock between Body CFG and
3711/// Latch to guarantee that there is only a single edge to the latch. It would
3712/// make loop transformations easier by not needing to consider multiple
3713/// predecessors of the latch (see redirectAllPredecessorsTo) and would give us
3714/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
3715/// executes after each body iteration.
3716///
3717/// There must be no loop-carried dependencies through llvm::Values. This is
3718/// equivalent to the Latch having no PHINode and the Header's only PHINode
3719/// being the one for the induction variable.
3720///
3721/// All code in Header, Cond, Latch and Exit (plus the terminator of the
3722/// Preheader) is CanonicalLoopInfo's responsibility and its build-up is checked
3723/// by assertOK(). It is expected to not be modified unless explicitly
3724/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
3725/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
3726/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
3727/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
3728/// anymore as its underlying control flow may not exist anymore.
3729/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3730/// may also return a new CanonicalLoopInfo that can be passed to other
3731/// loop-associated construct implementing methods. These loop-transforming
3732/// methods may either create a new CanonicalLoopInfo usually using
3733/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
3734/// modify one of the input CanonicalLoopInfo and return it as representing the
3735/// modified loop. What is done is an implementation detail of the
3736/// transformation-implementing method and callers should always assume that
3737/// the CanonicalLoopInfo passed in is invalidated and a new object is returned.
3738/// A returned CanonicalLoopInfo has the same structure and guarantees as one
3739/// created by createCanonicalLoop, such that transforming methods do not have
3740/// to special-case where the CanonicalLoopInfo originated from.
3741///
3742/// Generally, methods consuming CanonicalLoopInfo do not need an
3743/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3744/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3745/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3746/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3747/// any InsertPoint in the Preheader, After or Body block can still be used after
3748/// calling such a method.
3749///
3750/// TODO: Provide mechanisms for exception handling and cancellation points.
3751///
3752/// Defined outside OpenMPIRBuilder because nested classes cannot be
3753/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
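///
/// A minimal consumption sketch (illustrative only, not part of the original
/// header): once a valid CanonicalLoopInfo has been obtained (e.g. from
/// createCanonicalLoop, declared earlier in this header), user code is emitted
/// via its insertion points rather than via an explicit InsertPointTy argument.
/// `CLI` and `Builder` are assumed names for a CanonicalLoopInfo* and an
/// IRBuilder<>:
/// \code
///   Builder.restoreIP(CLI->getBodyIP());
///   Value *IV = CLI->getIndVar();         // unsigned, counts 0..TripCount-1
///   // ... emit the body of one iteration here, typically indexed by IV ...
///   Builder.restoreIP(CLI->getAfterIP()); // for code that runs after the loop
/// \endcode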
3754class CanonicalLoopInfo {
3755 friend class OpenMPIRBuilder;
3756
3757private:
3758 BasicBlock *Header = nullptr;
3759 BasicBlock *Cond = nullptr;
3760 BasicBlock *Latch = nullptr;
3761 BasicBlock *Exit = nullptr;
3762
3763 // Holds the llvm::Value for the `lastiter` of the canonical loop.
3764 Value *LastIter = nullptr;
3765
3766 /// Add the control blocks of this loop to \p BBs.
3767 ///
3768 /// This does not include any block from the body, not even the one returned
3769 /// by getBody().
3770 ///
3771 /// FIXME: This currently includes the Preheader and After blocks even though
3772 /// their content is (mostly) not under CanonicalLoopInfo's control.
3773 /// Re-evaluate whether this makes sense.
3774 void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
3775
3776 /// Sets the number of loop iterations to the given value. This value must be
3777 /// valid in the condition block (i.e., defined in the preheader) and is
3778 /// interpreted as an unsigned integer.
3779 void setTripCount(Value *TripCount);
3780
3781 /// Replace all uses of the canonical induction variable in the loop body with
3782 /// a new one.
3783 ///
3784 /// The intended use case is to update the induction variable for an updated
3785 /// iteration space such that it can stay normalized in the 0...tripcount-1
3786 /// range.
3787 ///
3788 /// The \p Updater is called with the (presumably updated) current normalized
3789 /// induction variable and is expected to return the value that uses of the
3790 /// pre-updated induction values should use instead, typically dependent on
3791 /// the new induction variable. This is a lambda (instead of e.g. just passing
3792 /// the new value) to be able to distinguish the uses of the pre-updated
3793 /// induction variable and uses of the induction variable to compute the
3794 /// updated induction variable value.
3795 void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
3796
3797public:
3798 /// Sets the last iteration variable for this loop.
3799 void setLastIter(Value *IterVar) { LastIter = IterVar; }
3800
3801 /// Returns the last iteration variable for this loop.
3802 /// Certain use-cases (like the translation of the linear clause) may access
3803 /// this variable even after a loop transformation. Hence, do not guard
3804 /// this getter function by `isValid`. It is the responsibility of the
3805 /// caller to ensure this functionality is not invoked on a non-outlined
3806 /// CanonicalLoopInfo object (in which case `setLastIter` will never be
3807 /// invoked and `LastIter` will remain `nullptr` by default).
3808 Value *getLastIter() { return LastIter; }
3809
3810 /// Returns whether this object currently represents the IR of a loop. If
3811 /// returning false, it may have been consumed by a loop transformation or not
3812 /// been initialized. Do not use it in this case.
3813 bool isValid() const { return Header; }
3814
3815 /// The preheader ensures that there is only a single edge entering the loop.
3816 /// Code that must be executed before any loop iteration can be emitted here,
3817 /// such as computing the loop trip count and begin lifetime markers. Code in
3818 /// the preheader is not considered part of the canonical loop.
3819 LLVM_ABI BasicBlock *getPreheader() const;
3820
3821 /// The header is the entry for each iteration. In the canonical control flow,
3822 /// it only contains the PHINode for the induction variable.
3823 BasicBlock *getHeader() const {
3824 assert(isValid() && "Requires a valid canonical loop");
3825 return Header;
3826 }
3827
3828 /// The condition block computes whether there is another loop iteration. If
3829 /// yes, branches to the body; otherwise to the exit block.
3830 BasicBlock *getCond() const {
3831 assert(isValid() && "Requires a valid canonical loop");
3832 return Cond;
3833 }
3834
3835 /// The body block is the single entry for a loop iteration and not controlled
3836 /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
3837 /// eventually branch to the \p Latch block.
3838 BasicBlock *getBody() const {
3839 assert(isValid() && "Requires a valid canonical loop");
3840 return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
3841 }
3842
3843 /// Reaching the latch indicates the end of the loop body code. In the
3844 /// canonical control flow, it only contains the increment of the induction
3845 /// variable.
3846 BasicBlock *getLatch() const {
3847 assert(isValid() && "Requires a valid canonical loop");
3848 return Latch;
3849 }
3850
3851 /// Reaching the exit indicates no more iterations are being executed.
3852 BasicBlock *getExit() const {
3853 assert(isValid() && "Requires a valid canonical loop");
3854 return Exit;
3855 }
3856
3857 /// The after block is intended for clean-up code such as lifetime end
3858 /// markers. It is separate from the exit block to ensure that, analogous to
3859 /// the preheader, it has just a single entry edge and is free of PHI nodes
3860 /// should there be multiple loop exits (such as from break
3861 /// statements/cancellations).
3862 BasicBlock *getAfter() const {
3863 assert(isValid() && "Requires a valid canonical loop");
3864 return Exit->getSingleSuccessor();
3865 }
3866
3867 /// Returns the llvm::Value containing the number of loop iterations. It must
3868 /// be valid in the preheader and always interpreted as an unsigned integer of
3869 /// any bit-width.
3870 Value *getTripCount() const {
3871 assert(isValid() && "Requires a valid canonical loop");
3872 Instruction *CmpI = &Cond->front();
3873 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
3874 return CmpI->getOperand(1);
3875 }
3876
3877 /// Returns the instruction representing the current logical induction
3878 /// variable. Always unsigned, always starting at 0 with an increment of one.
3879 Instruction *getIndVar() const {
3880 assert(isValid() && "Requires a valid canonical loop");
3881 Instruction *IndVarPHI = &Header->front();
3882 assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
3883 return IndVarPHI;
3884 }
3885
3886 /// Return the type of the induction variable (and the trip count).
3887 Type *getIndVarType() const {
3888 assert(isValid() && "Requires a valid canonical loop");
3889 return getIndVar()->getType();
3890 }
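  // Illustrative sketch (assumption): deriving a user-level loop variable from
  // the normalized induction variable, e.g. for a source loop
  // `for (i = Start; i < UB; i += Step)` normalized to 0..TripCount-1.
  // `Builder`, `CLI`, `Start` and `Step` are hypothetical names.
  //
  //   Builder.restoreIP(CLI->getBodyIP());
  //   Value *Scaled = Builder.CreateMul(CLI->getIndVar(), Step);
  //   Value *UserIV = Builder.CreateAdd(Start, Scaled);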
3891
3892 /// Return the insertion point for user code before the loop.
3893 OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
3894 assert(isValid() && "Requires a valid canonical loop");
3895 BasicBlock *Preheader = getPreheader();
3896 return {Preheader, std::prev(Preheader->end())};
3897 };
3898
3899 /// Return the insertion point for user code in the body.
3900 OpenMPIRBuilder::InsertPointTy getBodyIP() const {
3901 assert(isValid() && "Requires a valid canonical loop");
3902 BasicBlock *Body = getBody();
3903 return {Body, Body->begin()};
3904 };
3905
3906 /// Return the insertion point for user code after the loop.
3907 OpenMPIRBuilder::InsertPointTy getAfterIP() const {
3908 assert(isValid() && "Requires a valid canonical loop");
3909 BasicBlock *After = getAfter();
3910 return {After, After->begin()};
3911 };
3912
3913 Function *getFunction() const {
3914 assert(isValid() && "Requires a valid canonical loop");
3915 return Header->getParent();
3916 }
3917
3918 /// Consistency self-check.
3919 LLVM_ABI void assertOK() const;
3920
3921 /// Invalidate this loop. That is, the underlying IR does not fulfill the
3922 /// requirements of an OpenMP canonical loop anymore.
3923 LLVM_ABI void invalidate();
3924};
3925
3926/// ScanInfo holds the information to assist in the lowering of a scan reduction.
3927/// Before lowering, the body of the for-loop specifying the scan reduction is
3928/// expected to have the following structure:
3929///
3930/// Loop Body Entry
3931/// |
3932/// Code before the scan directive
3933/// |
3934/// Scan Directive
3935/// |
3936/// Code after the scan directive
3937/// |
3938/// Loop Body Exit
3939/// When `createCanonicalScanLoops` is executed, its bodyGen callback
3940/// transforms the body to:
3941///
3942/// Loop Body Entry
3943/// |
3944/// OMPScanDispatch
3945///
3946/// OMPBeforeScanBlock
3947/// |
3948/// OMPScanLoopExit
3949/// |
3950/// Loop Body Exit
3951///
3952/// The insert point is updated to the first insert point of OMPBeforeScanBlock,
3953/// which dominates the control flow of code generated until the scan directive
3954/// is encountered, while OMPAfterScanBlock dominates the control flow of code
3955/// generated after the scan directive. The successor of OMPScanDispatch can be
3956/// OMPBeforeScanBlock or OMPAfterScanBlock, based on 1. whether the Input phase
3957/// or the Scan phase is being lowered and 2. whether it is an exclusive or
3958/// inclusive scan. This jump is added when `createScan` is executed. When the
3959/// input loop is generated, for an inclusive scan `OMPAfterScanBlock` succeeds
3960/// `OMPScanDispatch` and for an exclusive scan `OMPBeforeScanBlock` succeeds
3961/// `OMPScanDispatch`; the opposite holds for the scan loop. At the end of the
3962/// input loop the temporary buffer is populated, and at the beginning of the
3963/// scan loop the temporary buffer is read. After the scan directive is
3964/// encountered, the insertion point is updated to `OMPAfterScanBlock` as it is
3965/// expected to dominate the code after the scan directive. Both the before and
3966/// after scan blocks are succeeded by `OMPScanLoopExit`.
3967/// Temporary buffer allocations are done in the `ScanLoopInit` block before the
3968/// for-loop is lowered. The results are copied back to the reduction variable
3969/// in the `ScanLoopFinish` block.
3970class ScanInfo {
3971public:
3972 /// Dominates the body of the loop before the scan directive
3973 llvm::BasicBlock *OMPBeforeScanBlock = nullptr;
3974
3975 /// Dominates the body of the loop after the scan directive
3976 llvm::BasicBlock *OMPAfterScanBlock = nullptr;
3977
3978 /// Controls the flow to before or after scan blocks
3979 llvm::BasicBlock *OMPScanDispatch = nullptr;
3980
3981 /// Exit block of loop body
3982 llvm::BasicBlock *OMPScanLoopExit = nullptr;
3983
3984 /// Block before loop body where scan initializations are done
3985 llvm::BasicBlock *OMPScanInit = nullptr;
3986
3987 /// Block after loop body where scan finalizations are done
3988 llvm::BasicBlock *OMPScanFinish = nullptr;
3989
3990 /// If true, indicates the Input phase is being lowered; otherwise the
3991 /// Scan phase is being lowered
3992 bool OMPFirstScanLoop = false;
3993
3994 /// Maps the private reduction variable to the pointer of the temporary
3995 /// buffer
3996 llvm::SmallDenseMap<llvm::Value *, llvm::Value *> *ScanBuffPtrs;
3997
3998 /// Keeps track of the value of the iteration variable for the input/scan loop,
3999 /// to be used for scan directive lowering
4000 llvm::Value *IV;
4001
4002 /// Stores the span of the canonical loop being lowered, to be used for
4003 /// temporary buffer allocation or finalization.
4004 llvm::Value *Span;
4005
4006 ScanInfo() {
4007 ScanBuffPtrs = new llvm::SmallDenseMap<llvm::Value *, llvm::Value *>();
4008 }
4009
4010 ~ScanInfo() { delete (ScanBuffPtrs); }
4011};
4012
4013} // end namespace llvm
4014
4015#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H