1//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the OpenMPIRBuilder class and helpers used as a convenient
10// way to create LLVM instructions for OpenMP directives.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16
20#include "llvm/IR/DebugLoc.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/Module.h"
23#include "llvm/IR/ValueMap.h"
26#include "llvm/Support/Error.h"
28#include <forward_list>
29#include <map>
30#include <optional>
31
32namespace llvm {
33class CanonicalLoopInfo;
34class ScanInfo;
35struct TargetRegionEntryInfo;
36class OffloadEntriesInfoManager;
37class OpenMPIRBuilder;
38class Loop;
39class LoopAnalysis;
40class LoopInfo;
41
42namespace vfs {
43class FileSystem;
44} // namespace vfs
45
47/// Move the instructions after an InsertPoint to the beginning of another
47/// BasicBlock.
48///
49/// The instructions after \p IP are moved to the beginning of \p New which must
50/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
51/// \p New will be added such that there is no semantic change. Otherwise, the
52/// \p IP insert block remains degenerate and it is up to the caller to insert a
53/// terminator. \p DL is used as the debug location for the branch instruction
54/// if one is created.
55LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
56 bool CreateBranch, DebugLoc DL);
57
58/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
59/// insert location will stick to after the instruction before the insertion
60/// point (instead of moving with the instruction the InsertPoint stores
61/// internally).
62LLVM_ABI void spliceBB(IRBuilder<> &Builder, BasicBlock *New,
63 bool CreateBranch);
64
65/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
66/// (missing the terminator).
67///
68/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
69/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
70/// is true, a branch to the new successor will be created such that
71/// semantically there is no change; otherwise the block of the insertion point
72/// remains degenerate and it is the caller's responsibility to insert a
73/// terminator. \p DL is used as the debug location for the branch instruction
74/// if one is created. Returns the new successor block.
75LLVM_ABI BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
76 DebugLoc DL, llvm::Twine Name = {});
77
78/// Split a BasicBlock at \p Builder's insertion point, even if the block is
79/// degenerate (missing the terminator). Its new insert location will stick to
80/// after the instruction before the insertion point (instead of moving with the
81/// instruction the InsertPoint stores internally).
82LLVM_ABI BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
83 llvm::Twine Name = {});
84
85/// Split a BasicBlock at \p Builder's insertion point, even if the block is
86/// degenerate (missing the terminator). Its new insert location will stick to
87/// after the instruction before the insertion point (instead of moving with the
88/// instruction the InsertPoint stores internally).
89LLVM_ABI BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch,
90 llvm::Twine Name);
91
92/// Like splitBB, but reuses the current block's name for the new name.
93LLVM_ABI BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder,
94 bool CreateBranch,
95 llvm::Twine Suffix = ".split");
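// A minimal usage sketch of the splitBB helpers (assuming `Builder` is an
// IRBuilder<> positioned inside a block that is still under construction):
//
//   llvm::BasicBlock *Cont = splitBB(Builder, /*CreateBranch=*/true, "cont");
//   // Builder still points into the original block, before the new branch;
//   // emit the current region's code here, then continue in Cont.
//   Builder.SetInsertPoint(Cont, Cont->begin());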
96
97/// Captures attributes that affect generating LLVM-IR using the
98/// OpenMPIRBuilder and related classes. Note that not all attributes are
99/// required for all classes or functions. In some use cases the configuration
100/// is not necessary at all, because the only functions that are called
101/// are ones that are not dependent on the configuration.
102class OpenMPIRBuilderConfig {
103public:
104 /// Flag to define whether to generate code for the role of the OpenMP host
105 /// (if set to false) or device (if set to true) in an offloading context. It
106 /// is set when the -fopenmp-is-target-device compiler frontend option is
107 /// specified.
108 std::optional<bool> IsTargetDevice;
109
110 /// Flag for specifying if the compilation is done for an accelerator. It is
111 /// set according to the architecture of the target triple and currently only
112 /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
113 /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
114 /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
115 /// like target with the ability to work as the OpenMP host is added, or if
116 /// the capabilities of the currently supported GPU architectures are
117 /// expanded.
118 std::optional<bool> IsGPU;
119
120 /// Flag for specifying if LLVMUsed information should be emitted.
121 std::optional<bool> EmitLLVMUsedMetaInfo;
122
123 /// Flag for specifying if offloading is mandatory.
124 std::optional<bool> OpenMPOffloadMandatory;
125
126 /// First separator used between the initial two parts of a name.
127 std::optional<StringRef> FirstSeparator;
128 /// Separator used between all other consecutive parts of a name.
129 std::optional<StringRef> Separator;
130
131 // Grid Value for the GPU target
132 std::optional<omp::GV> GridValue;
133
134 /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
135 /// false`), this contains the list of offloading triples associated, if any.
136 SmallVector<Triple> TargetTriples;
137
138 LLVM_ABI OpenMPIRBuilderConfig();
139 LLVM_ABI OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
140 bool OpenMPOffloadMandatory,
141 bool HasRequiresReverseOffload,
142 bool HasRequiresUnifiedAddress,
143 bool HasRequiresUnifiedSharedMemory,
144 bool HasRequiresDynamicAllocators);
145
146 // Getter functions that assert if the required values are not present.
147 bool isTargetDevice() const {
148 assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
149 return *IsTargetDevice;
150 }
151
152 bool isGPU() const {
153 assert(IsGPU.has_value() && "IsGPU is not set");
154 return *IsGPU;
155 }
156
157 bool openMPOffloadMandatory() const {
158 assert(OpenMPOffloadMandatory.has_value() &&
159 "OpenMPOffloadMandatory is not set");
160 return *OpenMPOffloadMandatory;
161 }
162
163 omp::GV getGridValue() const {
164 assert(GridValue.has_value() && "GridValue is not set");
165 return *GridValue;
166 }
167
168 bool hasRequiresFlags() const { return RequiresFlags; }
169 LLVM_ABI bool hasRequiresReverseOffload() const;
170 LLVM_ABI bool hasRequiresUnifiedAddress() const;
171 LLVM_ABI bool hasRequiresUnifiedSharedMemory() const;
172 LLVM_ABI bool hasRequiresDynamicAllocators() const;
173
174 /// Returns requires directive clauses as flags compatible with those expected
175 /// by libomptarget.
176 LLVM_ABI int64_t getRequiresFlags() const;
177
178 // Returns the FirstSeparator if set, otherwise the default separator
179 // depending on isGPU.
180 StringRef firstSeparator() const {
181 if (FirstSeparator.has_value())
182 return *FirstSeparator;
183 if (isGPU())
184 return "_";
185 return ".";
186 }
187
188 // Returns the Separator if set, otherwise the default separator depending
189 // on isGPU.
190 StringRef separator() const {
191 if (Separator.has_value())
192 return *Separator;
193 if (isGPU())
194 return "$";
195 return ".";
196 }
197
198 void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
199 void setIsGPU(bool Value) { IsGPU = Value; }
200 void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
201 void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
202 void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
203 void setSeparator(StringRef S) { Separator = S; }
204 void setGridValue(omp::GV G) { GridValue = G; }
205
206 LLVM_ABI void setHasRequiresReverseOffload(bool Value);
207 LLVM_ABI void setHasRequiresUnifiedAddress(bool Value);
208 LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value);
209 LLVM_ABI void setHasRequiresDynamicAllocators(bool Value);
210
211private:
212 /// Flags for specifying which requires directive clauses are present.
213 int64_t RequiresFlags;
214};
215
216/// Data structure to contain the information needed to uniquely identify
217/// a target entry.
218struct TargetRegionEntryInfo {
219 /// The prefix used for kernel names.
220 static constexpr const char *KernelNamePrefix = "__omp_offloading_";
221
222 std::string ParentName;
223 unsigned DeviceID;
224 unsigned FileID;
225 unsigned Line;
226 unsigned Count;
227
228 TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
229 TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
230 unsigned FileID, unsigned Line, unsigned Count = 0)
231 : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
232 Count(Count) {}
233
234 LLVM_ABI static void
235 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, StringRef ParentName,
236 unsigned DeviceID, unsigned FileID, unsigned Line,
237 unsigned Count);
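// A usage sketch (placeholder IDs and parent name; the exact encoding of
// DeviceID/FileID in the produced name is determined by this helper):
//
//   SmallString<128> KernelName;
//   TargetRegionEntryInfo::getTargetRegionEntryFnName(
//       KernelName, "parent_fn", /*DeviceID=*/1, /*FileID=*/2, /*Line=*/10,
//       /*Count=*/0);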
238
239 bool operator<(const TargetRegionEntryInfo &RHS) const {
240 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
241 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
242 RHS.Count);
243 }
244};
245
246/// Class that manages information about offload code regions and data
247class OffloadEntriesInfoManager {
248 OpenMPIRBuilder *OMPBuilder;
249 /// Number of entries registered so far.
250 unsigned OffloadingEntriesNum = 0;
251
252public:
253 /// Base class of the entries info.
254 class OffloadEntryInfo {
255 public:
256 /// Kind of a given entry.
257 enum OffloadingEntryInfoKinds : unsigned {
258 /// Entry is a target region.
259 OffloadingEntryInfoTargetRegion = 0,
260 /// Entry is a declare target variable.
261 OffloadingEntryInfoDeviceGlobalVar = 1,
262 /// Invalid entry info.
263 OffloadingEntryInfoInvalid = ~0u
264 };
265
266 protected:
267 OffloadEntryInfo() = delete;
268 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
269 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
270 uint32_t Flags)
271 : Flags(Flags), Order(Order), Kind(Kind) {}
272 ~OffloadEntryInfo() = default;
273
274 public:
275 bool isValid() const { return Order != ~0u; }
276 unsigned getOrder() const { return Order; }
277 OffloadingEntryInfoKinds getKind() const { return Kind; }
278 uint32_t getFlags() const { return Flags; }
279 void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
280 Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
281 void setAddress(Constant *V) {
282 assert(!Addr.pointsToAliveValue() && "Address has been set before!");
283 Addr = V;
284 }
285 static bool classof(const OffloadEntryInfo *Info) { return true; }
286
287 private:
288 /// Address of the entity that has to be mapped for offloading.
289 WeakTrackingVH Addr;
290
291 /// Flags associated with the device global.
292 uint32_t Flags = 0u;
293
294 /// Order this entry was emitted.
295 unsigned Order = ~0u;
296
297 OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
298 };
299
300 /// Return true if there are no entries defined.
301 LLVM_ABI bool empty() const;
302 /// Return number of entries defined so far.
303 unsigned size() const { return OffloadingEntriesNum; }
304
305 OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
306
307 //
308 // Target region entries related.
309 //
310
311 /// Kind of the target registry entry.
312 enum OMPTargetRegionEntryKind : uint32_t {
313 /// Mark the entry as target region.
314 OMPTargetRegionEntryTargetRegion = 0x0,
315 };
316
317 /// Target region entries info.
318 class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
319 /// Address that can be used as the ID of the entry.
320 Constant *ID = nullptr;
321
322 public:
323 OffloadEntryInfoTargetRegion()
324 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
325 explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
326 Constant *ID,
327 OMPTargetRegionEntryKind Flags)
328 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
329 ID(ID) {
330 setAddress(Addr);
331 }
332
333 Constant *getID() const { return ID; }
334 void setID(Constant *V) {
335 assert(!ID && "ID has been set before!");
336 ID = V;
337 }
338 static bool classof(const OffloadEntryInfo *Info) {
339 return Info->getKind() == OffloadingEntryInfoTargetRegion;
340 }
341 };
342
343 /// Initialize target region entry.
344 /// This is ONLY needed for DEVICE compilation.
345 LLVM_ABI void
346 initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
347 unsigned Order);
348 /// Register target region entry.
349 LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
350 Constant *Addr, Constant *ID,
351 OMPTargetRegionEntryKind Flags);
352 /// Return true if a target region entry with the provided information
353 /// exists.
354 LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
355 bool IgnoreAddressId = false) const;
356
357 // Return the Name based on \a EntryInfo using the next available Count.
358 LLVM_ABI void
359 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
360 const TargetRegionEntryInfo &EntryInfo);
361
362 /// Applies action \a Action on all registered entries.
363 typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
364 const OffloadEntryInfoTargetRegion &)>
365 OffloadTargetRegionEntryInfoActTy;
366 LLVM_ABI void
367 actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);
368
369 //
370 // Device global variable entries related.
371 //
372
373 /// Kind of the global variable entry.
374 enum OMPTargetGlobalVarEntryKind : uint32_t {
375 /// Mark the entry as a declare target to.
376 OMPTargetGlobalVarEntryTo = 0x0,
377 /// Mark the entry as a declare target link.
378 OMPTargetGlobalVarEntryLink = 0x1,
379 /// Mark the entry as a declare target enter.
380 OMPTargetGlobalVarEntryEnter = 0x2,
381 /// Mark the entry as having no declare target entry kind.
382 OMPTargetGlobalVarEntryNone = 0x3,
383 /// Mark the entry as a declare target indirect global.
384 OMPTargetGlobalVarEntryIndirect = 0x8,
385 /// Mark the entry as a register requires global.
386 OMPTargetGlobalRegisterRequires = 0x10,
387 };
388
389 /// Kind of device clause for declare target variables
390 /// and functions
391 /// NOTE: Currently not used as part of a variable entry; used by Flang
392 /// and Clang to interface with the variable-related registration
393 /// functions.
394 enum OMPTargetDeviceClauseKind : uint32_t {
395 /// The target is marked for all devices
396 OMPTargetDeviceClauseAny = 0x0,
397 /// The target is marked for non-host devices
398 OMPTargetDeviceClauseNoHost = 0x1,
399 /// The target is marked for host devices
400 OMPTargetDeviceClauseHost = 0x2,
401 /// The target is marked as having no clause
402 OMPTargetDeviceClauseNone = 0x3
403 };
404
405 /// Device global variable entries info.
406 class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
407 /// Size of the global variable.
408 int64_t VarSize;
409 GlobalValue::LinkageTypes Linkage;
410 const std::string VarName;
411
412 public:
413 OffloadEntryInfoDeviceGlobalVar()
414 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
415 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
416 OMPTargetGlobalVarEntryKind Flags)
417 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
418 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
419 int64_t VarSize,
420 OMPTargetGlobalVarEntryKind Flags,
421 GlobalValue::LinkageTypes Linkage,
422 const std::string &VarName)
423 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
424 VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
425 setAddress(Addr);
426 }
427
428 int64_t getVarSize() const { return VarSize; }
429 StringRef getVarName() const { return VarName; }
430 void setVarSize(int64_t Size) { VarSize = Size; }
431 GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
432 void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
433 static bool classof(const OffloadEntryInfo *Info) {
434 return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
435 }
436 };
437
438 /// Initialize device global variable entry.
439 /// This is ONLY used for DEVICE compilation.
440 LLVM_ABI void initializeDeviceGlobalVarEntryInfo(
441 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order);
442
443 /// Register device global variable entry.
444 LLVM_ABI void registerDeviceGlobalVarEntryInfo(
445 StringRef VarName, Constant *Addr, int64_t VarSize,
446 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage);
447 /// Checks if the variable with the given name has been registered already.
448 bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
449 return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
450 }
451 /// Applies action \a Action on all registered entries.
452 typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
453 OffloadDeviceGlobalVarEntryInfoActTy;
454 LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(
455 const OffloadDeviceGlobalVarEntryInfoActTy &Action);
456
457private:
458 /// Return the count of entries at a particular source location.
459 unsigned
460 getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;
461
462 /// Update the count of entries at a particular source location.
463 void
464 incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);
465
466 static TargetRegionEntryInfo
467 getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
468 return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
469 EntryInfo.FileID, EntryInfo.Line, 0);
470 }
471
472 // Count of entries at a location.
473 std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;
474
475 // Storage for target region entries kind.
476 typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
477 OffloadEntriesTargetRegionTy;
478 OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
479 /// Storage for device global variable entries kind. The storage is to be
480 /// indexed by mangled name.
481 typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
482 OffloadEntriesDeviceGlobalVarTy;
483 OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
484};
485
486/// An interface to create LLVM-IR for OpenMP directives.
487///
488/// Each OpenMP directive has a corresponding public generator method.
489class OpenMPIRBuilder {
490public:
491 /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
492 /// not have an effect on \p M (see initialize)
493 OpenMPIRBuilder(Module &M)
494 : M(M), Builder(M.getContext()), OffloadInfoManager(this),
495 T(M.getTargetTriple()), IsFinalized(false) {}
496 LLVM_ABI ~OpenMPIRBuilder();
497
498 class AtomicInfo : public llvm::AtomicInfo {
499 llvm::Value *AtomicVar;
500
501 public:
502 AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits,
503 uint64_t ValueSizeInBits, llvm::Align AtomicAlign,
504 llvm::Align ValueAlign, bool UseLibcall,
505 IRBuilderBase::InsertPoint AllocaIP, llvm::Value *AtomicVar)
506 : llvm::AtomicInfo(Builder, Ty, AtomicSizeInBits, ValueSizeInBits,
507 AtomicAlign, ValueAlign, UseLibcall, AllocaIP),
508 AtomicVar(AtomicVar) {}
509
510 llvm::Value *getAtomicPointer() const override { return AtomicVar; }
511 void decorateWithTBAA(llvm::Instruction *I) override {}
512 llvm::AllocaInst *CreateAlloca(llvm::Type *Ty,
513 const llvm::Twine &Name) const override {
514 llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty);
515 allocaInst->setName(Name);
516 return allocaInst;
517 }
518 };
519 /// Initialize the internal state. This will put structure types and
520 /// potentially other helpers into the underlying module. Must be called
521 /// before any other method and only once! This internal state includes types
522 /// used in the OpenMPIRBuilder generated from OMPKinds.def.
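/// A minimal setup sketch (the flag values are placeholders that a frontend
/// would provide; not a complete lowering pipeline):
/// \code
///   OpenMPIRBuilder OMPBuilder(M);
///   OpenMPIRBuilderConfig Config(
///       /*IsTargetDevice=*/false, /*IsGPU=*/false,
///       /*OpenMPOffloadMandatory=*/false, /*HasRequiresReverseOffload=*/false,
///       /*HasRequiresUnifiedAddress=*/false,
///       /*HasRequiresUnifiedSharedMemory=*/false,
///       /*HasRequiresDynamicAllocators=*/false);
///   OMPBuilder.setConfig(Config);
///   OMPBuilder.initialize();
///   // ... emit directives via the create*/apply* methods ...
///   OMPBuilder.finalize();
/// \endcode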
523 LLVM_ABI void initialize();
524
525 void setConfig(OpenMPIRBuilderConfig C) { Config = C; }
526
527 /// Finalize the underlying module, e.g., by outlining regions.
528 /// \param Fn The function to be finalized. If not used,
529 /// all functions are finalized.
530 LLVM_ABI void finalize(Function *Fn = nullptr);
531
532 /// Check whether the finalize function has already run
533 /// \return true if the finalize function has already run
534 LLVM_ABI bool isFinalized();
535
536 /// Add attributes known for \p FnID to \p Fn.
537 LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
538
539 /// Type used throughout for insertion points.
540 using InsertPointTy = IRBuilder<>::InsertPoint;
541
542 /// Type used to represent an insertion point or an error value.
543 using InsertPointOrErrorTy = Expected<InsertPointTy>;
544
545 /// Create a name using the platform-specific separators.
546 /// \param Parts parts of the final name that need separation
547 /// The created name has a first separator between the first and second part
548 /// and a second separator between all other parts.
549 /// E.g. with FirstSeparator "$" and Separator "." and
550 /// parts: "p1", "p2", "p3", "p4"
551 /// The resulting name is "p1$p2.p3.p4"
552 /// The separators are retrieved from the OpenMPIRBuilderConfig.
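/// A short usage sketch (assuming an OpenMPIRBuilder instance `OMPBuilder`
/// whose config uses FirstSeparator "$" and Separator "."):
/// \code
///   std::string Name =
///       OMPBuilder.createPlatformSpecificName({"p1", "p2", "p3"});
///   // Name == "p1$p2.p3"
/// \endcode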
553 LLVM_ABI std::string
554 createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
555
556 /// Callback type for variable finalization (think destructors).
557 ///
558 /// \param CodeGenIP is the insertion point at which the finalization code
559 /// should be placed.
560 ///
561 /// A finalize callback knows about all objects that need finalization (e.g.,
562 /// destruction) when the scope of the currently generated construct is left,
563 /// at the time and location at which the callback is invoked.
564 using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>;
565
566 struct FinalizationInfo {
567 /// The finalization callback provided by the last in-flight invocation of
568 /// createXXXX for the directive of kind DK.
569 FinalizeCallbackTy FiniCB;
570
571 /// The directive kind of the innermost directive that has an associated
572 /// region which might require finalization when it is left.
573 omp::Directive DK;
574
575 /// Flag to indicate if the directive is cancellable.
576 bool IsCancellable;
577 };
578
579 /// Push a finalization callback on the finalization stack.
580 ///
581 /// NOTE: Temporary solution until Clang CG is gone.
582 void pushFinalizationCB(const FinalizationInfo &FI) {
583 FinalizationStack.push_back(FI);
584 }
585
586 /// Pop the last finalization callback from the finalization stack.
587 ///
588 /// NOTE: Temporary solution until Clang CG is gone.
589 void popFinalizationCB() { FinalizationStack.pop_back(); }
590
591 /// Callback type for body (=inner region) code generation
592 ///
593 /// The callback takes code locations as arguments, each describing a
594 /// location where additional instructions can be inserted.
595 ///
596 /// The CodeGenIP may be in the middle of a basic block or point to the end of
597 /// it. The basic block may have a terminator or be degenerate. The callback
598 /// function may just insert instructions at that position, but also split the
599 /// block (without the Before argument of BasicBlock::splitBasicBlock such
600/// that the identity of the split predecessor block is preserved) and insert
601 /// additional control flow, including branches that do not lead back to what
602 /// follows the CodeGenIP. Note that since the callback is allowed to split
603 /// the block, callers must assume that InsertPoints to positions in the
604 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
605/// such InsertPoints need to be preserved, the caller can split the block
606/// itself before calling the callback.
607 ///
608 /// AllocaIP and CodeGenIP must not point to the same position.
609 ///
610 /// \param AllocaIP is the insertion point at which new alloca instructions
611 /// should be placed. The BasicBlock it is pointing to must
612 /// not be split.
613 /// \param CodeGenIP is the insertion point at which the body code should be
614 /// placed.
615 ///
616 /// \return an error, if any were triggered during execution.
617 using BodyGenCallbackTy =
618 function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
619
620 // This is created primarily for the sections construct, as llvm::function_ref
621 // (BodyGenCallbackTy) is not storable (as described in the comments of the
622 // function_ref class: function_ref contains a non-owning reference to the
623 // callable).
624 ///
625 /// \return an error, if any were triggered during execution.
626 using StorableBodyGenCallbackTy =
627 std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
628
629 /// Callback type for loop body code generation.
630 ///
631 /// \param CodeGenIP is the insertion point where the loop's body code must be
632 /// placed. This will be a dedicated BasicBlock with a
633 /// conditional branch from the loop condition check and
634 /// terminated with an unconditional branch to the loop
635 /// latch.
636 /// \param IndVar is the induction variable usable at the insertion point.
637 ///
638 /// \return an error, if any were triggered during execution.
639 using LoopBodyGenCallbackTy =
640 function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>;
641
642 /// Callback type for variable privatization (think copy & default
643 /// constructor).
644 ///
645 /// \param AllocaIP is the insertion point at which new alloca instructions
646 /// should be placed.
647 /// \param CodeGenIP is the insertion point at which the privatization code
648 /// should be placed.
649 /// \param Original The value being copied/created, should not be used in the
650 /// generated IR.
651 /// \param Inner The equivalent of \p Original that should be used in the
652 /// generated IR; this is equal to \p Original if the value is
653 /// a pointer and can thus be passed directly, otherwise it is
654 /// an equivalent but different value.
655 /// \param ReplVal The replacement value, thus a copy or new created version
656 /// of \p Inner.
657 ///
658 /// \returns The new insertion point where code generation continues and
659 /// \p ReplVal the replacement value.
660 using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy(
661 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
662 Value &Inner, Value *&ReplVal)>;
663
664 /// Description of a LLVM-IR insertion point (IP) and a debug/source location
665 /// (filename, line, column, ...).
666 struct LocationDescription {
667 LocationDescription(const IRBuilderBase &IRB)
668 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
669 LocationDescription(const InsertPointTy &IP) : IP(IP) {}
670 LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
671 : IP(IP), DL(DL) {}
672 InsertPointTy IP;
673 DebugLoc DL;
674 };
675
676 /// Emitter methods for OpenMP directives.
677 ///
678 ///{
679
680 /// Generator for '#omp barrier'
681 ///
682 /// \param Loc The location where the barrier directive was encountered.
683 /// \param Kind The kind of directive that caused the barrier.
684 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
685 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
686 /// should be checked and acted upon.
688 ///
689 /// \returns The insertion point after the barrier.
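/// A usage sketch (assumes `Builder` is an IRBuilder<> positioned where the
/// barrier should be emitted and `OMPBuilder` is an initialized builder;
/// error handling only indicated):
/// \code
///   LocationDescription Loc(Builder);
///   InsertPointOrErrorTy AfterIP =
///       OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
///   if (!AfterIP)
///     return AfterIP.takeError();
///   Builder.restoreIP(*AfterIP);
/// \endcode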
690 LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc,
691 omp::Directive Kind,
692 bool ForceSimpleCall = false,
693 bool CheckCancelFlag = true);
694
695 /// Generator for '#omp cancel'
696 ///
697 /// \param Loc The location where the directive was encountered.
698 /// \param IfCondition The evaluated 'if' clause expression, if any.
699/// \param CanceledDirective The kind of directive that is canceled.
700///
701/// \returns The insertion point after the cancel.
702 LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc,
703 Value *IfCondition,
704 omp::Directive CanceledDirective);
705
706 /// Generator for '#omp cancellation point'
707 ///
708 /// \param Loc The location where the directive was encountered.
709/// \param CanceledDirective The kind of directive that is canceled.
710///
711/// \returns The insertion point after the cancellation point.
712 LLVM_ABI InsertPointOrErrorTy createCancellationPoint(
713 const LocationDescription &Loc, omp::Directive CanceledDirective);
714
715 /// Creates a ScanInfo object, allocates and returns the pointer.
716 LLVM_ABI Expected<ScanInfo *> scanInfoInitialize();
717
718 /// Generator for '#omp parallel'
719 ///
720 /// \param Loc The insert and source location description.
721 /// \param AllocaIP The insertion points to be used for alloca instructions.
722 /// \param BodyGenCB Callback that will generate the region code.
723 /// \param PrivCB Callback to copy a given variable (think copy constructor).
724 /// \param FiniCB Callback to finalize variable copies.
725 /// \param IfCondition The evaluated 'if' clause expression, if any.
726 /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
727 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
728 /// \param IsCancellable Flag to indicate a cancellable parallel region.
729 ///
730 /// \returns The insertion position *after* the parallel.
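/// A condensed driving sketch (the callbacks have trivial bodies here; a real
/// frontend emits the region code in BodyGenCB and privatization logic in
/// PrivCB; `Loc` and `AllocaIP` are assumed to be set up already):
/// \code
///   auto BodyGenCB = [&](InsertPointTy AllocaIP,
///                        InsertPointTy CodeGenIP) -> Error {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the parallel region body ...
///     return Error::success();
///   };
///   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
///                     Value &Orig, Value &Inner,
///                     Value *&ReplVal) -> InsertPointOrErrorTy {
///     ReplVal = &Inner; // no privatization in this sketch
///     return CodeGenIP;
///   };
///   auto FiniCB = [&](InsertPointTy IP) -> Error { return Error::success(); };
///   InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
///       Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
///       /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
///       /*IsCancellable=*/false);
/// \endcode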
731 LLVM_ABI InsertPointOrErrorTy createParallel(
732 const LocationDescription &Loc, InsertPointTy AllocaIP,
733 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
734 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
735 omp::ProcBindKind ProcBind, bool IsCancellable);
736
737 /// Generator for the control flow structure of an OpenMP canonical loop.
738 ///
739 /// This generator operates on the logical iteration space of the loop, i.e.
740 /// the caller only has to provide a loop trip count of the loop as defined by
741 /// base language semantics. The trip count is interpreted as an unsigned
742 /// integer. The induction variable passed to \p BodyGenCB will be of the same
743 /// type and run from 0 to \p TripCount - 1. It is up to the callback to
744 /// convert the logical iteration variable to the loop counter variable in the
745 /// loop body.
746 ///
747 /// \param Loc The insert and source location description. The insert
748 /// location can be between two instructions or the end of a
749 /// degenerate block (e.g. a BB under construction).
750 /// \param BodyGenCB Callback that will generate the loop body code.
751 /// \param TripCount Number of iterations the loop body is executed.
752 /// \param Name Base name used to derive BB and instruction names.
753 ///
754 /// \returns An object representing the created control flow structure which
755 /// can be used for loop-associated directives.
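/// A sketch of driving this overload (`TripCount` is any unsigned integer
/// Value available at the insert point; error handling elided):
/// \code
///   auto LoopBodyCB = [&](InsertPointTy CodeGenIP, Value *IV) -> Error {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the loop body using IV (runs from 0 to TripCount - 1) ...
///     return Error::success();
///   };
///   Expected<CanonicalLoopInfo *> CLI =
///       OMPBuilder.createCanonicalLoop(Loc, LoopBodyCB, TripCount);
/// \endcode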
756 LLVM_ABI Expected<CanonicalLoopInfo *>
757 createCanonicalLoop(const LocationDescription &Loc,
758 LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
759 const Twine &Name = "loop");
760
761 /// Generator for the control flow structure of OpenMP canonical loops when
762 /// the parent directive has an `inscan` modifier specified.
763 /// If the `inscan` modifier is specified, the region of the parent is
764 /// expected to contain a `scan` directive. Based on the clauses of that
765 /// scan directive, the body of the loop is split into two loops: an input
766 /// loop, which contains the code generated for the input phase of the scan,
767 /// and a scan loop, which contains the code generated for the scan phase.
768 /// From the bodyGen callback of these loops, `createScan` is called when a
769 /// scan directive is encountered in the loop body. Based on whether 1. an
770 /// inclusive or exclusive scan is specified and 2. the input loop or the
771 /// scan loop is being generated, `createScan` lowers the body of the loop
772 /// accordingly.
773 ///
774 /// \param Loc The insert and source location description.
775 /// \param BodyGenCB Callback that will generate the loop body code.
776 /// \param Start Value of the loop counter for the first iterations.
777 /// \param Stop Loop counter values past this will stop the loop.
778 /// \param Step Loop counter increment after each iteration; negative
779 /// means counting down.
780 /// \param IsSigned Whether Start, Stop and Step are signed integers.
781 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
782 /// counter.
783 /// \param ComputeIP Insertion point for instructions computing the trip
784 /// count. Can be used to ensure the trip count is available
785 /// at the outermost loop of a loop nest. If not set,
786 /// defaults to the preheader of the generated loop.
787 /// \param Name Base name used to derive BB and instruction names.
788 /// \param ScanRedInfo Pointer to the ScanInfo object created using
789 /// `scanInfoInitialize`.
790 ///
791 /// \returns A vector containing Loop Info of Input Loop and Scan Loop.
792 LLVM_ABI Expected<SmallVector<llvm::CanonicalLoopInfo *>>
793 createCanonicalScanLoops(const LocationDescription &Loc,
794 LoopBodyGenCallbackTy BodyGenCB, Value *Start,
795 Value *Stop, Value *Step, bool IsSigned,
796 bool InclusiveStop, InsertPointTy ComputeIP,
797 const Twine &Name, ScanInfo *ScanRedInfo);
798
799 /// Calculate the trip count of a canonical loop.
800 ///
801 /// This allows specifying user-defined loop counter values using increment,
802 /// upper- and lower bounds. To disambiguate the terminology when counting
803 /// downwards, instead of lower bounds we use \p Start for the loop counter
804 /// value in the first body iteration.
805 ///
806 /// Consider the following limitations:
807 ///
808 /// * A loop counter space over all integer values of its bit-width cannot be
809 /// represented (e.g. using uint8_t, a loop trip count of 256 cannot be
810 /// stored into an 8-bit integer):
811 ///
812 /// DO I = 0, 255, 1
813 ///
814 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g.
815 /// effectively counting downwards:
816 ///
817 /// for (uint8_t i = 100u; i > 0; i += 127u)
818 ///
819 ///
820 /// TODO: May need to add additional parameters to represent:
821 ///
822 /// * Allow representing downcounting with unsigned integers.
823 ///
824 /// * Sign of the step and the comparison operator might disagree:
825 ///
826 /// for (int i = 0; i < 42; i -= 1u)
827 ///
828 /// \param Loc The insert and source location description.
829 /// \param Start Value of the loop counter for the first iterations.
830 /// \param Stop Loop counter values past this will stop the loop.
831 /// \param Step Loop counter increment after each iteration; negative
832 /// means counting down.
833 /// \param IsSigned Whether Start, Stop and Step are signed integers.
834 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
835 /// counter.
836 /// \param Name Base name used to derive instruction names.
837 ///
838 /// \returns The value holding the calculated trip count.
839 LLVM_ABI Value *calculateCanonicalLoopTripCount(
840 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
841 bool IsSigned, bool InclusiveStop, const Twine &Name = "loop");
842
843 /// Generator for the control flow structure of an OpenMP canonical loop.
844 ///
845 /// Instead of a logical iteration space, this allows specifying user-defined
846 /// loop counter values using increment, upper- and lower bounds. To
847 /// disambiguate the terminology when counting downwards, instead of lower
848 /// bounds we use \p Start for the loop counter value in the first body iteration.
849 ///
850 /// It calls \see calculateCanonicalLoopTripCount for trip count calculations,
851 /// so limitations of that method apply here as well.
852 ///
853 /// \param Loc The insert and source location description.
854 /// \param BodyGenCB Callback that will generate the loop body code.
855 /// \param Start Value of the loop counter for the first iterations.
856 /// \param Stop Loop counter values past this will stop the loop.
857 /// \param Step Loop counter increment after each iteration; negative
858 /// means counting down.
859 /// \param IsSigned Whether Start, Stop and Step are signed integers.
860 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
861 /// counter.
862 /// \param ComputeIP Insertion point for instructions computing the trip
863 /// count. Can be used to ensure the trip count is available
864 /// at the outermost loop of a loop nest. If not set,
865 /// defaults to the preheader of the generated loop.
866 /// \param Name Base name used to derive BB and instruction names.
867 /// \param InScan Whether loop has a scan reduction specified.
868 /// \param ScanRedInfo Pointer to the ScanInfo object created using
869 /// `scanInfoInitialize`.
870 ///
871 /// \returns An object representing the created control flow structure which
872 /// can be used for loop-associated directives.
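/// A sketch for this overload with i32 bounds (`N` is a placeholder host
/// value; the loop counts from 0 to N-1):
/// \code
///   Value *Start = Builder.getInt32(0);
///   Value *Stop = Builder.getInt32(N);
///   Value *Step = Builder.getInt32(1);
///   Expected<CanonicalLoopInfo *> CLI = OMPBuilder.createCanonicalLoop(
///       Loc, LoopBodyCB, Start, Stop, Step, /*IsSigned=*/true,
///       /*InclusiveStop=*/false);
/// \endcode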
873 LLVM_ABI Expected<CanonicalLoopInfo *> createCanonicalLoop(
874 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
875 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
876 InsertPointTy ComputeIP = {}, const Twine &Name = "loop",
877 bool InScan = false, ScanInfo *ScanRedInfo = nullptr);
878
879 /// Collapse a loop nest into a single loop.
880 ///
881 /// Merges loops of a loop nest into a single CanonicalLoopInfo representation
882 /// that has the same number of innermost loop iterations as the original loop
883 /// nest. The induction variables of the input loops are derived from the
884 /// collapsed loop's induction variable. This is intended to be used to
885 /// implement OpenMP's collapse clause. Before applying a directive,
886 /// collapseLoops normalizes a loop nest to contain only a single loop and the
887 /// directive's implementation does not need to handle multiple loops itself.
888 /// This does not remove the need for directives to handle the loop nest in
889 /// some cases, such as for the ordered(<n>) clause or the simd schedule-clause
890 /// modifier of the worksharing-loop directive.
891 ///
892 /// Example:
893 /// \code
894 ///   for (int i = 0; i < 7; ++i) // Canonical loop "i"
895 ///     for (int j = 0; j < 9; ++j) // Canonical loop "j"
896 ///       body(i, j);
897 /// \endcode
898 ///
899 /// After collapsing with Loops={i,j}, the loop is changed to
900 /// \code
901 ///   for (int ij = 0; ij < 63; ++ij) {
902 ///     int i = ij / 9;
903 ///     int j = ij % 9;
904 ///     body(i, j);
905 ///   }
906 /// \endcode
907 ///
908 /// In the current implementation, the following limitations apply:
909 ///
910 /// * All input loops have an induction variable of the same type.
911 ///
912 /// * The collapsed loop will have the same trip count integer type as the
913 /// input loops. Therefore it is possible that the collapsed loop cannot
914 /// represent all iterations of the input loops. For instance, assuming a
915 /// 32 bit integer type, and two input loops both iterating 2^16 times, the
916 /// theoretical trip count of the collapsed loop would be 2^32 iterations,
917 /// which cannot be represented in a 32-bit integer. Behavior is undefined
918 /// in this case.
919 ///
920 /// * The trip counts of every input loop must be available at \p ComputeIP.
921 /// Non-rectangular loops are not yet supported.
922 ///
923 /// * At each nest level, code between a surrounding loop and its nested loop
924 /// is hoisted into the loop body, and such code will be executed more
925 /// often than before collapsing (or not at all if any inner loop iteration
926 /// has a trip count of 0). This is permitted by the OpenMP specification.
927 ///
928 /// \param DL Debug location for instructions added for collapsing,
929 /// such as instructions to compute/derive the input loop's
930 /// induction variables.
931 /// \param Loops Loops in the loop nest to collapse. Loops are specified
932 /// from outermost-to-innermost and every control flow of a
933 /// loop's body must pass through its directly nested loop.
934 /// \param ComputeIP Where additional instructions that compute the collapsed
935 /// trip count should be placed. If not set, defaults to
936 /// before the generated loop.
937 ///
938 /// \returns The CanonicalLoopInfo object representing the collapsed loop.
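/// Invocation sketch (`CLI_I` and `CLI_J` are the CanonicalLoopInfo objects of
/// the example above, outermost first; `DL` is the debug location to use):
/// \code
///   CanonicalLoopInfo *Collapsed =
///       OMPBuilder.collapseLoops(DL, {CLI_I, CLI_J}, /*ComputeIP=*/{});
/// \endcode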
939 LLVM_ABI CanonicalLoopInfo *collapseLoops(DebugLoc DL,
940 ArrayRef<CanonicalLoopInfo *> Loops,
941 InsertPointTy ComputeIP);
942
943 /// Get the default alignment value for given target
944 ///
945 /// \param TargetTriple Target triple
946 /// \param Features StringMap which describes extra CPU features
947 LLVM_ABI static unsigned
948 getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
949 const StringMap<bool> &Features);
950
951 /// Retrieve (or create if non-existent) the address of a declare
952 /// target variable, used in conjunction with registerTargetGlobalVariable
953 /// to create declare target global variables.
954 ///
955 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
956 /// clause used in conjunction with the variable being registered (link,
957 /// to, enter).
958 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
959 /// clause used in conjunction with the variable being registered (nohost,
960 /// host, any)
961 /// \param IsDeclaration - boolean stating if the variable being registered
962 /// is a declaration-only and not a definition
963 /// \param IsExternallyVisible - boolean stating if the variable is externally
964 /// visible
965 /// \param EntryInfo - Unique entry information for the value generated
966 /// using getTargetEntryUniqueInfo, used to name generated pointer references
967 /// to the declare target variable
968 /// \param MangledName - the mangled name of the variable being registered
969 /// \param GeneratedRefs - references generated by invocations of
970 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
971 /// these are required by Clang for book keeping.
972 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
973 /// \param TargetTriple - The OpenMP device target triple we are compiling
974 /// for
975 /// \param LlvmPtrTy - The type of the variable we are generating or
976 /// retrieving an address for
977 /// \param GlobalInitializer - a lambda function which creates a constant
978 /// used for initializing a pointer reference to the variable in certain
979 /// cases. If a nullptr is passed, it will default to utilising the original
980 /// variable to initialize the pointer reference.
981 /// \param VariableLinkage - a lambda function which returns the variable's
982 /// linkage type; if unspecified and a nullptr is given, it will instead
983 /// utilise the linkage stored on the existing global variable in the
984 /// LLVMModule.
985 LLVM_ABI Constant *getAddrOfDeclareTargetVar(
986 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
987 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
988 bool IsDeclaration, bool IsExternallyVisible,
989 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
990 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
991 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
992 std::function<Constant *()> GlobalInitializer,
993 std::function<GlobalValue::LinkageTypes()> VariableLinkage);
994
995 /// Registers a target variable for device or host.
996 ///
997 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
998 /// clause used in conjunction with the variable being registered (link,
999 /// to, enter).
1000 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
1001 /// clause used in conjunction with the variable being registered (nohost,
1002 /// host, any)
1003 /// \param IsDeclaration - boolean stating if the variable being registered
1004 /// is a declaration-only and not a definition
1005 /// \param IsExternallyVisible - boolean stating if the variable is externally
1006 /// visible
1007 /// \param EntryInfo - Unique entry information for the value generated
1008 /// using getTargetEntryUniqueInfo, used to name generated pointer references
1009 /// to the declare target variable
1010 /// \param MangledName - the mangled name of the variable being registered
1011 /// \param GeneratedRefs - references generated by invocations of
1012 /// registerTargetGlobalVariable these are required by Clang for book
1013 /// keeping.
1014 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
1015 /// \param TargetTriple - The OpenMP device target triple we are compiling
1016 /// for
1017 /// \param GlobalInitializer - a lambda function which creates a constant
1018 /// used for initializing a pointer reference to the variable in certain
1019 /// cases. If a nullptr is passed, it will default to utilising the original
1020 /// variable to initialize the pointer reference.
1021 /// \param VariableLinkage - a lambda function which returns the variable's
1022 /// linkage type; if unspecified and a nullptr is given, it will instead
1023 /// utilise the linkage stored on the existing global variable in the
1024 /// LLVMModule.
1025 /// \param LlvmPtrTy - The type of the variable we are generating or
1026 /// retrieving an address for
1027 /// \param Addr - the original llvm value (addr) of the variable to be
1028 /// registered
1029 LLVM_ABI void registerTargetGlobalVariable(
1030 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
1031 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
1032 bool IsDeclaration, bool IsExternallyVisible,
1033 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
1034 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
1035 std::vector<Triple> TargetTriple,
1036 std::function<Constant *()> GlobalInitializer,
1037 std::function<GlobalValue::LinkageTypes()> VariableLinkage,
1038 Type *LlvmPtrTy, Constant *Addr);
1039
1040 /// Get the offset of the OMP_MAP_MEMBER_OF field.
1041 LLVM_ABI unsigned getFlagMemberOffset();
1042
1043 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
1044 /// the position given.
1045 /// \param Position - A value indicating the position of the parent
1046 /// of the member in the kernel argument structure, often retrieved
1047 /// by the parent's position in the combined information vectors used
1048 /// to generate the structure itself. Multiple children (members of)
1049 /// with the same parent will use the same returned member flag.
1050 LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);
1051
1052 /// Given an initial flag set, this function modifies it to contain
1053 /// the passed in MemberOfFlag generated from the getMemberOfFlag
1054 /// function. The results are dependent on the existing flag bits
1055 /// set in the original flag set.
1056 /// \param Flags - The original set of flags to be modified with the
1057 /// passed in MemberOfFlag.
1058 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
1059 /// slightly by getMemberOfFlag, which adjusts the flag bits
1060 /// based on the member's position in its parent.
1061 LLVM_ABI void
1062 setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
1063 omp::OpenMPOffloadMappingFlags MemberOfFlag);
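// A combination sketch (`Position` is the hypothetical index of the parent in
// the combined mapping information vectors):
//
//   omp::OpenMPOffloadMappingFlags Flags =
//       omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
//   OMPBuilder.setCorrectMemberOfFlag(Flags,
//                                     OMPBuilder.getMemberOfFlag(Position));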
1064
1065private:
1066 /// Modifies the canonical loop to be a statically-scheduled workshare loop
1067 /// which is executed on the device.
1068 ///
1069 /// This takes a \p CLI representing a canonical loop, such as the one
1070 /// created by \see createCanonicalLoop and emits additional instructions to
1071 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1072 /// runtime function in the preheader to call the OpenMP device RTL function
1073 /// which handles worksharing of the loop body iterations.
1074 ///
1075 /// \param DL Debug location for instructions added for the
1076 /// workshare-loop construct itself.
1077 /// \param CLI A descriptor of the canonical loop to workshare.
1078 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1079 /// preheader of the loop.
1080 /// \param LoopType Information about type of loop worksharing.
1081 /// It corresponds to type of loop workshare OpenMP pragma.
1082 ///
1083 /// \returns Point where to insert code after the workshare construct.
1084 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
1085 InsertPointTy AllocaIP,
1086 omp::WorksharingLoopType LoopType);
1087
1088 /// Modifies the canonical loop to be a statically-scheduled workshare loop.
1089 ///
1090 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1091 /// created by \p createCanonicalLoop and emits additional instructions to
1092 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1093 /// runtime function in the preheader to obtain the loop bounds to be used in
1094 /// the current thread, updates the relevant instructions in the canonical
1095 /// loop and calls to an OpenMP runtime finalization function after the loop.
1096 ///
1097 /// \param DL Debug location for instructions added for the
1098 /// workshare-loop construct itself.
1099 /// \param CLI A descriptor of the canonical loop to workshare.
1100 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1101 /// preheader of the loop.
1102 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1103 /// the loop.
1104 /// \param LoopType Type of workshare loop.
1105 ///
1106 /// \returns Point where to insert code after the workshare construct.
1107 InsertPointOrErrorTy applyStaticWorkshareLoop(
1108 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1109 omp::WorksharingLoopType LoopType, bool NeedsBarrier);
1110
1111 /// Modifies the canonical loop to be a statically-scheduled workshare loop with a
1112 /// user-specified chunk size.
1113 ///
1114 /// \param DL Debug location for instructions added for the
1115 /// workshare-loop construct itself.
1116 /// \param CLI A descriptor of the canonical loop to workshare.
1117 /// \param AllocaIP An insertion point for Alloca instructions usable in
1118 /// the preheader of the loop.
1119 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
1120 /// loop.
1121 /// \param ChunkSize The user-specified chunk size.
1122 ///
1123 /// \returns Point where to insert code after the workshare construct.
1124 InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
1125 CanonicalLoopInfo *CLI,
1126 InsertPointTy AllocaIP,
1127 bool NeedsBarrier,
1128 Value *ChunkSize);
1129
1130 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
1131 ///
1132 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1133 /// created by \p createCanonicalLoop and emits additional instructions to
1134 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1135 /// runtime function in the preheader to obtain, and then in each iteration
1136 /// to update the loop counter.
1137 ///
1138 /// \param DL Debug location for instructions added for the
1139 /// workshare-loop construct itself.
1140 /// \param CLI A descriptor of the canonical loop to workshare.
1141 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1142 /// preheader of the loop.
1143 /// \param SchedType Type of scheduling to be passed to the init function.
1144 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1145 /// the loop.
1146 /// \param Chunk The size of loop chunk considered as a unit when
1147 /// scheduling. If \p nullptr, defaults to 1.
1148 ///
1149 /// \returns Point where to insert code after the workshare construct.
1150 InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL,
1151 CanonicalLoopInfo *CLI,
1152 InsertPointTy AllocaIP,
1153 omp::OMPScheduleType SchedType,
1154 bool NeedsBarrier,
1155 Value *Chunk = nullptr);
1156
1157 /// Create an alternative version of the loop to support the if clause.
1158 ///
1159 /// The OpenMP if clause can require generating a second loop. This loop
1160 /// will be executed when the if clause condition is not met. createIfVersion
1161 /// adds a branch instruction to the copied loop if \p IfCond is not met.
1162 ///
1163 /// \param Loop Original loop which should be versioned.
1164 /// \param IfCond Value which corresponds to if clause condition
1165 /// \param VMap Value to value map to define relation between
1166 /// original and copied loop values and loop blocks.
1167 /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1168 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1169 ValueMap<const Value *, WeakTrackingVH> &VMap,
1170 LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L,
1171 const Twine &NamePrefix = "");
1172
1173public:
1174 /// Modifies the canonical loop to be a workshare loop.
1175 ///
1176 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1177 /// created by \p createCanonicalLoop and emits additional instructions to
1178 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1179 /// runtime function in the preheader to obtain the loop bounds to be used in
1180 /// the current thread, updates the relevant instructions in the canonical
1181 /// loop and calls to an OpenMP runtime finalization function after the loop.
1182 ///
1183 /// The concrete transformation is done by applyStaticWorkshareLoop,
1184 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1185 /// on the value of \p SchedKind and \p ChunkSize.
1186 ///
1187 /// \param DL Debug location for instructions added for the
1188 /// workshare-loop construct itself.
1189 /// \param CLI A descriptor of the canonical loop to workshare.
1190 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1191 /// preheader of the loop.
1192 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1193 /// the loop.
1194 /// \param SchedKind Scheduling algorithm to use.
1195 /// \param ChunkSize The chunk size for the inner loop.
1196 /// \param HasSimdModifier Whether the simd modifier is present in the
1197 /// schedule clause.
1198 /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1199 /// the schedule clause.
1200 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1201 /// present in the schedule clause.
1202 /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1203 /// present.
1204 /// \param LoopType Information about type of loop worksharing.
1205 /// It corresponds to type of loop workshare OpenMP pragma.
1206 ///
1207 /// \returns Point where to insert code after the workshare construct.
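/// Typical invocation sketch (`CLI` as produced by createCanonicalLoop; static
/// schedule without chunking):
/// \code
///   InsertPointOrErrorTy AfterIP = OMPBuilder.applyWorkshareLoop(
///       DL, CLI, AllocaIP, /*NeedsBarrier=*/true,
///       llvm::omp::OMP_SCHEDULE_Static);
/// \endcode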
1208 LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
1209 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1210 bool NeedsBarrier,
1211 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1212 Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1213 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1214 bool HasOrderedClause = false,
1215 omp::WorksharingLoopType LoopType =
1216 omp::WorksharingLoopType::ForStaticLoop);
1217
1218 /// Tile a loop nest.
1219 ///
1220 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
1221 /// \p Loops must be perfectly nested, from outermost to innermost loop
1222 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1223 /// of every loop and every tile size must be usable in the outermost
1224 /// loop's preheader. This implies that the loop nest is rectangular.
1225 ///
1226 /// Example:
1227 /// \code
1228 ///   for (int i = 0; i < 15; ++i) // Canonical loop "i"
1229 ///     for (int j = 0; j < 14; ++j) // Canonical loop "j"
1230 ///       body(i, j);
1231 /// \endcode
1232 ///
1233 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1234 /// \code
1235 /// for (int i1 = 0; i1 < 3; ++i1)
1236 /// for (int j1 = 0; j1 < 2; ++j1)
1237 /// for (int i2 = 0; i2 < 5; ++i2)
1238 /// for (int j2 = 0; j2 < 7; ++j2)
1239 /// body(i1*5+i2, j1*7+j2);
1240 /// \endcode
1241 ///
1242 /// The returned vector contains the loops {i1,j1,i2,j2}. The loops i1 and j1
1243 /// are referred to as the floor loops, and i2 and j2 as the tile loops. Tiling also
1244 /// handles non-constant trip counts, non-constant tile sizes and trip counts
1245 /// that are not multiples of the tile size. In the latter case the tile loop
1246 /// of the last floor-loop iteration will have fewer iterations than specified
1247 /// as its tile size.
1248 ///
1249 ///
1250 /// @param DL Debug location for instructions added by tiling, for
1251 /// instance the floor- and tile trip count computation.
1252 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are
1253 /// invalidated by this method, i.e. should not be used after
1254 /// tiling.
1255 /// @param TileSizes For each loop in \p Loops, the tile size for that
1256 /// dimension.
1257 ///
1258 /// \returns A list of generated loops. Contains twice as many loops as the
1259 /// input loop nest; the first half are the floor loops and the
1260 /// second half are the tile loops.
1261 LLVM_ABI std::vector<CanonicalLoopInfo *>
1262 tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
1263 ArrayRef<Value *> TileSizes);
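// Illustrative sketch (not part of the header): tiling a two-deep nest with
// constant tile sizes. LoopI and LoopJ are assumed to be CanonicalLoopInfo
// objects returned by earlier createCanonicalLoop calls, and the tile-size
// types are assumed to match the loops' trip-count type.
//
//   Value *Sizes[] = {Builder.getInt32(5), Builder.getInt32(7)};
//   std::vector<CanonicalLoopInfo *> NewLoops =
//       OMPBuilder.tileLoops(DL, {LoopI, LoopJ}, Sizes);
//   // NewLoops == {FloorI, FloorJ, TileI, TileJ}; LoopI and LoopJ are
//   // invalidated and must not be used after this call.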
1264
1265 /// Fully unroll a loop.
1266 ///
1267 /// Instead of unrolling the loop immediately (and duplicating its body
1268 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1269 /// metadata.
1270 ///
1271 /// \param DL Debug location for instructions added by unrolling.
1272 /// \param Loop The loop to unroll. The loop will be invalidated.
1273 LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
1274
1275 /// Fully or partially unroll a loop. How the loop is unrolled is determined
1276 /// using LLVM's LoopUnrollPass.
1277 ///
1278 /// \param DL Debug location for instructions added by unrolling.
1279 /// \param Loop The loop to unroll. The loop will be invalidated.
1280 LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
1281
1282 /// Partially unroll a loop.
1283 ///
1284 /// The CanonicalLoopInfo of the unrolled loop for use with chained
1285 /// loop-associated directives can be requested using \p UnrolledCLI. Not
1286 /// needing the CanonicalLoopInfo allows more efficient code generation by
1287 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1288 /// A loop-associated directive applied to the unrolled loop needs to know the
1289 /// new trip count which means that if using a heuristically determined unroll
1290 /// factor (\p Factor == 0), that factor must be computed immediately. We are
1291 /// using the same logic as the LoopUnrollPass to derive the unroll factor,
1292 /// which assumes that some canonicalization has taken place (e.g.
1293 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1294 /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1295 ///
1296 /// \param DL Debug location for instructions added by unrolling.
1297 /// \param Loop The loop to unroll. The loop will be invalidated.
1298 /// \param Factor The factor to unroll the loop by. A factor of 0
1299 /// indicates that a heuristic should be used to determine
1300 /// the unroll-factor.
1301 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1302 /// partially unrolled loop. Otherwise, uses loop metadata
1303 /// to defer unrolling to the LoopUnrollPass.
1304 LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
1305 int32_t Factor,
1306 CanonicalLoopInfo **UnrolledCLI);
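// Illustrative sketch (not part of the header): partially unrolling by a
// factor of 4 while keeping a CanonicalLoopInfo for a subsequent
// loop-associated directive. Names other than the API itself are assumptions.
//
//   CanonicalLoopInfo *Unrolled = nullptr;
//   OMPBuilder.unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled);
//   // Passing nullptr instead of &Unrolled defers the actual unrolling to
//   // the LoopUnrollPass via loop metadata.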
1307
1308 /// Add metadata to simd-ize a loop. If \p IfCond is not nullptr, the loop
1309 /// is cloned and the metadata that prevents vectorization is added to
1310 /// the cloned loop. The cloned loop is executed when \p IfCond evaluates
1311 /// to false.
1312 ///
1313 /// \param Loop The loop to simd-ize.
1314 /// \param AlignedVars The map which contains pairs of the pointer
1315 /// and its corresponding alignment.
1316 /// \param IfCond The value which corresponds to the if clause
1317 /// condition.
1318 /// \param Order The enum value corresponding to the order clause.
1319 /// \param Simdlen The Simdlen length to apply to the simd loop.
1320 /// \param Safelen The Safelen length to apply to the simd loop.
1321 LLVM_ABI void applySimd(CanonicalLoopInfo *Loop,
1322 MapVector<Value *, Value *> AlignedVars,
1323 Value *IfCond, omp::OrderKind Order,
1324 ConstantInt *Simdlen, ConstantInt *Safelen);
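// Illustrative sketch (not part of the header): marking a canonical loop for
// vectorization with simdlen(8) and one aligned pointer. PtrArg and OrderVal
// (an omp::OrderKind value chosen by the frontend) are assumptions.
//
//   MapVector<Value *, Value *> Aligned;
//   Aligned[PtrArg] = Builder.getInt64(64); // pointer aligned to 64 bytes
//   OMPBuilder.applySimd(Loop, Aligned, /*IfCond=*/nullptr, OrderVal,
//                        /*Simdlen=*/Builder.getInt32(8),
//                        /*Safelen=*/nullptr);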
1325
1326 /// Generator for '#omp flush'
1327 ///
1328 /// \param Loc The location where the flush directive was encountered
1329 LLVM_ABI void createFlush(const LocationDescription &Loc);
1330
1331 /// Generator for '#omp taskwait'
1332 ///
1333 /// \param Loc The location where the taskwait directive was encountered.
1334 LLVM_ABI void createTaskwait(const LocationDescription &Loc);
1335
1336 /// Generator for '#omp taskyield'
1337 ///
1338 /// \param Loc The location where the taskyield directive was encountered.
1339 LLVM_ABI void createTaskyield(const LocationDescription &Loc);
1340
1341 /// A struct to pack the relevant information for an OpenMP depend clause.
1342 struct DependData {
1343 omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
1344 Type *DepValueType;
1345 Value *DepVal;
1346 explicit DependData() = default;
1347 DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
1348 Value *DepVal)
1349 : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
1350 };
1351
1352 /// Generator for `#omp task`
1353 ///
1354 /// \param Loc The location where the task construct was encountered.
1355 /// \param AllocaIP The insertion point to be used for alloca instructions.
1356 /// \param BodyGenCB Callback that will generate the region code.
1357 /// \param Tied True if the task is tied, false if the task is untied.
1358 /// \param Final i1 value which is `true` if the task is final, `false` if the
1359 /// task is not final.
1360 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1361 /// task is generated, and the encountering thread must
1362 /// suspend the current task region, for which execution
1363 /// cannot be resumed until execution of the structured
1364 /// block that is associated with the generated task is
1365 /// completed.
1366 /// \param EventHandle If present, signifies the event handle as part of
1367 /// the detach clause
1368 /// \param Mergeable True if the given task is `mergeable`.
1369 /// \param Priority The `priority-value`, which specifies the execution
1370 /// order of the tasks generated by the construct.
1371 LLVM_ABI InsertPointOrErrorTy
1372 createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
1373 BodyGenCallbackTy BodyGenCB, bool Tied = true,
1374 Value *Final = nullptr, Value *IfCondition = nullptr,
1375 SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
1376 Value *EventHandle = nullptr, Value *Priority = nullptr);
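// Illustrative sketch (not part of the header): emitting a tied task with one
// 'in' dependency. DepVar, Loc, AllocaIP and BodyGenCB are assumptions, and
// omp::RTLDependenceKindTy::DepIn is assumed to be the 'in' dependence kind.
//
//   SmallVector<OpenMPIRBuilder::DependData> Deps;
//   Deps.emplace_back(omp::RTLDependenceKindTy::DepIn, Builder.getInt32Ty(),
//                     DepVar);
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//       OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB, /*Tied=*/true,
//                             /*Final=*/nullptr, /*IfCondition=*/nullptr,
//                             Deps);
//   if (!AfterIP)
//     return AfterIP.takeError();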
1377
1378 /// Generator for the taskgroup construct
1379 ///
1380 /// \param Loc The location where the taskgroup construct was encountered.
1381 /// \param AllocaIP The insertion point to be used for alloca instructions.
1382 /// \param BodyGenCB Callback that will generate the region code.
1383 LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc,
1384 InsertPointTy AllocaIP,
1385 BodyGenCallbackTy BodyGenCB);
1386
1387 using FileIdentifierInfoCallbackTy =
1388 std::function<std::tuple<std::string, uint64_t>()>;
1389
1390 /// Creates unique info for a target entry from a provided filename and
1391 /// line number.
1392 ///
1393 /// \param CallBack A callback function which should return the filename the
1394 /// entry resides in as well as the line number for the target entry.
1395 /// \param ParentName The name of the parent the target entry resides in, if
1396 /// any.
1397 LLVM_ABI static TargetRegionEntryInfo
1398 getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
1399 StringRef ParentName = "");
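// Illustrative sketch (not part of the header): supplying the callback from a
// frontend that knows the file and line of the target construct; "foo.c" and
// 42 are placeholder values.
//
//   TargetRegionEntryInfo EntryInfo =
//       OpenMPIRBuilder::getTargetEntryUniqueInfo(
//           []() -> std::tuple<std::string, uint64_t> {
//             return {"foo.c", 42}; // filename and line of the entry
//           },
//           /*ParentName=*/"foo");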
1400
1401 /// Enum class for the ReductionGen CallBack type to be used.
1402 enum class ReductionGenCBKind { Clang, MLIR };
1403
1404 /// ReductionGen CallBack for Clang
1405 ///
1406 /// \param CodeGenIP InsertPoint for CodeGen.
1407 /// \param Index Index of the ReductionInfo to generate code for.
1408 /// \param LHS Optionally used by Clang to return the LHS pointer it used for
1409 /// codegen, used for fixup later.
1410 /// \param RHS Optionally used by Clang to return the RHS pointer it used for
1411 /// codegen, used for fixup later.
1412 /// \param CurFn Optionally used by Clang to pass in the Current Function as
1413 /// Clang context may be old.
1414 using ReductionGenClangCBTy =
1415 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1416 Value **LHS, Value **RHS, Function *CurFn)>;
1417
1418 /// ReductionGen CallBack for MLIR
1419 ///
1420 /// \param CodeGenIP InsertPoint for CodeGen.
1421 /// \param LHS Pass in the LHS Value to be used for CodeGen.
1422 /// \param RHS Pass in the RHS Value to be used for CodeGen.
1423 using ReductionGenCBTy = std::function<InsertPointOrErrorTy(
1424 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1425
1426 /// Functions used to generate atomic reductions. Such functions take two
1427 /// Values representing pointers to LHS and RHS of the reduction, as well as
1428 /// the element type of these pointers. They are expected to atomically
1429 /// update the LHS to the reduced value.
1430 using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy(
1431 InsertPointTy, Type *, Value *, Value *)>;
1432
1433 /// Enum class for reduction evaluation types: scalar, complex and aggregate.
1434 enum class EvalKind { Scalar, Complex, Aggregate };
1435
1436 /// Information about an OpenMP reduction.
1437 struct ReductionInfo {
1438 ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
1439 EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
1440 ReductionGenClangCBTy ReductionGenClang,
1441 ReductionGenAtomicCBTy AtomicReductionGen)
1442 : ElementType(ElementType), Variable(Variable),
1443 PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
1444 ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
1445 AtomicReductionGen(AtomicReductionGen) {}
1446 ReductionInfo(Value *PrivateVariable)
1447 : ElementType(nullptr), Variable(nullptr),
1448 PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
1449 ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}
1450
1451 /// Reduction element type, must match pointee type of variable.
1452 Type *ElementType;
1453
1454 /// Reduction variable of pointer type.
1455 Value *Variable;
1456
1457 /// Thread-private partial reduction variable.
1458 Value *PrivateVariable;
1459
1460 /// Reduction evaluation kind - scalar, complex or aggregate.
1461 EvalKind EvaluationKind;
1462
1463 /// Callback for generating the reduction body. The IR produced by this will
1464 /// be used to combine two values in a thread-safe context, e.g., under
1465 /// lock or within the same thread, and therefore need not be atomic.
1466 ReductionGenCBTy ReductionGen;
1467
1468 /// Clang callback for generating the reduction body. The IR produced by
1469 /// this will be used to combine two values in a thread-safe context, e.g.,
1470 /// under lock or within the same thread, and therefore need not be atomic.
1471 ReductionGenClangCBTy ReductionGenClang;
1472
1473 /// Callback for generating the atomic reduction body, may be null. The IR
1474 /// produced by this will be used to atomically combine two values during
1475 /// reduction. If null, the implementation will use the non-atomic version
1476 /// along with the appropriate synchronization mechanisms.
1477 ReductionGenAtomicCBTy AtomicReductionGen;
1478 };
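// Illustrative sketch (not part of the header): describing a scalar float
// sum-reduction for use with createReductions/createReductionsGPU. Orig and
// Priv (the shared and privatized variables) are assumptions; no atomic
// generator is provided, so the non-atomic path is used.
//
//   OpenMPIRBuilder::ReductionGenCBTy SumGen =
//       [&](OpenMPIRBuilder::InsertPointTy IP, Value *LHS, Value *RHS,
//           Value *&Res) -> OpenMPIRBuilder::InsertPointOrErrorTy {
//     Builder.restoreIP(IP);
//     Res = Builder.CreateFAdd(LHS, RHS);
//     return Builder.saveIP();
//   };
//   OpenMPIRBuilder::ReductionInfo RI(
//       Builder.getFloatTy(), Orig, Priv, OpenMPIRBuilder::EvalKind::Scalar,
//       SumGen, /*ReductionGenClang=*/nullptr, /*AtomicReductionGen=*/nullptr);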
1479
1480 enum class CopyAction : unsigned {
1481 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1482 // the warp using shuffle instructions.
1483 RemoteLaneToThread,
1484 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1485 ThreadCopy,
1486 };
1487
1488 struct CopyOptionsTy {
1489 Value *RemoteLaneOffset = nullptr;
1490 Value *ScratchpadIndex = nullptr;
1491 Value *ScratchpadWidth = nullptr;
1492 };
1493
1494 /// Supporting functions for Reductions CodeGen.
1495private:
1496 /// Get the id of the current thread on the GPU.
1497 Value *getGPUThreadID();
1498
1499 /// Get the GPU warp size.
1500 Value *getGPUWarpSize();
1501
1502 /// Get the id of the warp in the block.
1503 /// We assume that the warp size is 32, which is always the case
1504 /// on the NVPTX device, to generate more efficient code.
1505 Value *getNVPTXWarpID();
1506
1507 /// Get the id of the current lane in the Warp.
1508 /// We assume that the warp size is 32, which is always the case
1509 /// on the NVPTX device, to generate more efficient code.
1510 Value *getNVPTXLaneID();
1511
1512 /// Cast value to the specified type.
1513 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1514
1515 /// This function creates calls to one of two shuffle functions to copy
1516 /// variables between lanes in a warp.
1517 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1518 Type *ElementType, Value *Offset);
1519
1520 /// Function to shuffle over the value from the remote lane.
1521 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1522 Type *ElementType, Value *Offset,
1523 Type *ReductionArrayTy);
1524
1525 /// Emit instructions to copy a Reduce list, which contains partially
1526 /// aggregated values, in the specified direction.
1527 void emitReductionListCopy(
1528 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1529 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1530 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1531
1532 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1533 /// in the same warp. It uses shuffle instructions to copy over data from
1534 /// a remote lane's stack. The reduction algorithm performed is specified
1535 /// by the fourth parameter.
1536 ///
1537 /// Algorithm Versions.
1538 /// Full Warp Reduce (argument value 0):
1539 /// This algorithm assumes that all 32 lanes are active and gathers
1540 /// data from these 32 lanes, producing a single resultant value.
1541 /// Contiguous Partial Warp Reduce (argument value 1):
1542 /// This algorithm assumes that only a *contiguous* subset of lanes
1543 /// are active. This happens for the last warp in a parallel region
1544 /// when the user specified num_threads is not an integer multiple of
1545 /// 32. This contiguous subset always starts with the zeroth lane.
1546 /// Partial Warp Reduce (argument value 2):
1547 /// This algorithm gathers data from any number of lanes at any position.
1548 /// All reduced values are stored in the lowest possible lane. The set
1549 /// of problems every algorithm addresses is a super set of those
1550 /// addressable by algorithms with a lower version number. Overhead
1551 /// increases as algorithm version increases.
1552 ///
1553 /// Terminology
1554 /// Reduce element:
1555 /// Reduce element refers to the individual data field with primitive
1556 /// data types to be combined and reduced across threads.
1557 /// Reduce list:
1558 /// Reduce list refers to a collection of local, thread-private
1559 /// reduce elements.
1560 /// Remote Reduce list:
1561 /// Remote Reduce list refers to a collection of remote (relative to
1562 /// the current thread) reduce elements.
1563 ///
1564 /// We distinguish between three states of threads that are important to
1565 /// the implementation of this function.
1566 /// Alive threads:
1567 /// Threads in a warp executing the SIMT instruction, as distinguished from
1568 /// threads that are inactive due to divergent control flow.
1569 /// Active threads:
1570 /// The minimal set of threads that has to be alive upon entry to this
1571 /// function. The computation is correct iff active threads are alive.
1572 /// Some threads are alive but they are not active because they do not
1573 /// contribute to the computation in any useful manner. Turning them off
1574 /// may introduce control flow overheads without any tangible benefits.
1575 /// Effective threads:
1576 /// In order to comply with the argument requirements of the shuffle
1577 /// function, we must keep all lanes holding data alive. But at most
1578 /// half of them perform value aggregation; we refer to this half of
1579 /// threads as effective. The other half is simply handing off their
1580 /// data.
1581 ///
1582 /// Procedure
1583 /// Value shuffle:
1584 /// In this step active threads transfer data from higher lane positions
1585 /// in the warp to lower lane positions, creating Remote Reduce list.
1586 /// Value aggregation:
1587 /// In this step, effective threads combine their thread local Reduce list
1588 /// with Remote Reduce list and store the result in the thread local
1589 /// Reduce list.
1590 /// Value copy:
1591 /// In this step, we deal with the assumption made by algorithm 2
1592 /// (i.e. contiguity assumption). When we have an odd number of lanes
1593 /// active, say 2k+1, only k threads will be effective and therefore k
1594 /// new values will be produced. However, the Reduce list owned by the
1595 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1596 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1597 /// that the contiguity assumption still holds.
1598 ///
1599 /// \param ReductionInfos Array type containing the ReductionOps.
1600 /// \param ReduceFn The reduction function.
1601 /// \param FuncAttrs Optional param to specify any function attributes that
1602 /// need to be copied to the new function.
1603 ///
1604 /// \return The ShuffleAndReduce function.
1605 Function *emitShuffleAndReduceFunction(
1606 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1607 Function *ReduceFn, AttributeList FuncAttrs);
1608
1609 /// Helper function for CreateCanonicalScanLoops to create the input loop
1610 /// in the first generator and the scan loop in the second generator.
1611 /// \param InputLoopGen Callback for generating the loop for the input phase.
1612 /// \param ScanLoopGen Callback for generating the loop for the scan phase.
1613 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1614 /// `ScanInfoInitialize`.
1615 ///
1616 /// \return error if any produced, else return success.
1617 Error emitScanBasedDirectiveIR(
1618 llvm::function_ref<Error()> InputLoopGen,
1619 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
1620 ScanInfo *ScanRedInfo);
1621
1622 /// Creates the basic blocks required for scan reduction.
1623 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1624 /// `ScanInfoInitialize`.
1625 void createScanBBs(ScanInfo *ScanRedInfo);
1626
1627 /// Dynamically allocates the buffer needed for scan reduction.
1628 /// \param AllocaIP The IP where possibly-shared pointer of buffer needs to
1629 /// be declared.
1630 /// \param ScanVars Scan Variables.
1631 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1632 /// `ScanInfoInitialize`.
1633 ///
1634 /// \return error if any produced, else return success.
1635 Error emitScanBasedDirectiveDeclsIR(InsertPointTy AllocaIP,
1636 ArrayRef<llvm::Value *> ScanVars,
1637 ArrayRef<llvm::Type *> ScanVarsType,
1638 ScanInfo *ScanRedInfo);
1639
1640 /// Copies the result back to the reduction variable.
1641 /// \param ReductionInfos Array type containing the ReductionOps.
1642 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1643 /// `ScanInfoInitialize`.
1644 ///
1645 /// \return error if any produced, else return success.
1646 Error emitScanBasedDirectiveFinalsIR(
1647 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1648 ScanInfo *ScanInfo);
1649
1650 /// This function emits a helper that gathers Reduce lists from the first
1651 /// lane of every active warp to lanes in the first warp.
1652 ///
1653 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1654 /// shared smem[warp_size];
1655 /// For all data entries D in reduce_data:
1656 /// sync
1657 /// If (I am the first lane in each warp)
1658 /// Copy my local D to smem[warp_id]
1659 /// sync
1660 /// if (I am the first warp)
1661 /// Copy smem[thread_id] to my local D
1662 ///
1663 /// \param Loc The insert and source location description.
1664 /// \param ReductionInfos Array type containing the ReductionOps.
1665 /// \param FuncAttrs Optional param to specify any function attributes that
1666 /// need to be copied to the new function.
1667 ///
1668 /// \return The InterWarpCopy function.
1669 Expected<Function *>
1670 emitInterWarpCopyFunction(const LocationDescription &Loc,
1671 ArrayRef<ReductionInfo> ReductionInfos,
1672 AttributeList FuncAttrs);
1673
1674 /// This function emits a helper that copies all the reduction variables from
1675 /// the team into the provided global buffer for the reduction variables.
1676 ///
1677 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1678 /// For all data entries D in reduce_data:
1679 /// Copy local D to buffer.D[Idx]
1680 ///
1681 /// \param ReductionInfos Array type containing the ReductionOps.
1682 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1683 /// \param FuncAttrs Optional param to specify any function attributes that
1684 /// need to be copied to the new function.
1685 ///
1686 /// \return The ListToGlobalCopy function.
1687 Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1688 Type *ReductionsBufferTy,
1689 AttributeList FuncAttrs);
1690
1691 /// This function emits a helper that copies all the reduction variables from
1692 /// the provided global buffer back into the team's reduction variables.
1693 ///
1694 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
1695 /// For all data entries D in reduce_data:
1696 /// Copy buffer.D[Idx] to local D;
1697 ///
1698 /// \param ReductionInfos Array type containing the ReductionOps.
1699 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1700 /// \param FuncAttrs Optional param to specify any function attributes that
1701 /// need to be copied to the new function.
1702 ///
1703 /// \return The GlobalToList function.
1704 Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1705 Type *ReductionsBufferTy,
1706 AttributeList FuncAttrs);
1707
1708 /// This function emits a helper that reduces all the reduction variables from
1709 /// the team into the provided global buffer for the reduction variables.
1710 ///
1711 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1712 /// void *GlobPtrs[];
1713 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1714 /// ...
1715 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1716 /// reduce_function(GlobPtrs, reduce_data);
1717 ///
1718 /// \param ReductionInfos Array type containing the ReductionOps.
1719 /// \param ReduceFn The reduction function.
1720 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1721 /// \param FuncAttrs Optional param to specify any function attributes that
1722 /// need to be copied to the new function.
1723 ///
1724 /// \return The ListToGlobalReduce function.
1725 Function *
1726 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1727 Function *ReduceFn, Type *ReductionsBufferTy,
1728 AttributeList FuncAttrs);
1729
1730 /// This function emits a helper that reduces all the reduction variables in
1731 /// the provided global buffer into the team's reduction data.
1732 ///
1733 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1734 /// void *GlobPtrs[];
1735 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1736 /// ...
1737 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1738 /// reduce_function(reduce_data, GlobPtrs);
1739 ///
1740 /// \param ReductionInfos Array type containing the ReductionOps.
1741 /// \param ReduceFn The reduction function.
1742 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1743 /// \param FuncAttrs Optional param to specify any function attributes that
1744 /// need to be copied to the new function.
1745 ///
1746 /// \return The GlobalToListReduce function.
1747 Function *
1748 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1749 Function *ReduceFn, Type *ReductionsBufferTy,
1750 AttributeList FuncAttrs);
1751
1752 /// Get the function name of a reduction function.
1753 std::string getReductionFuncName(StringRef Name) const;
1754
1755 /// Emits reduction function.
1756 /// \param ReducerName Name of the function calling the reduction.
1757 /// \param ReductionInfos Array type containing the ReductionOps.
1758 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1759 /// CodeGenCB kind.
1760 /// \param FuncAttrs Optional param to specify any function attributes that
1761 /// need to be copied to the new function.
1762 ///
1763 /// \return The reduction function.
1764 Expected<Function *> createReductionFunction(
1765 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1766 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1767 AttributeList FuncAttrs = {});
1768
1769public:
1770 ///
1771 /// Design of OpenMP reductions on the GPU
1772 ///
1773 /// Consider a typical OpenMP program with one or more reduction
1774 /// clauses:
1775 ///
1776 /// float foo;
1777 /// double bar;
1778 /// #pragma omp target teams distribute parallel for \
1779 /// reduction(+:foo) reduction(*:bar)
1780 /// for (int i = 0; i < N; i++) {
1781 /// foo += A[i]; bar *= B[i];
1782 /// }
1783 ///
1784 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1785 /// all teams. In our OpenMP implementation on the NVPTX device an
1786 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1787 /// within a team are mapped to CUDA threads within a threadblock.
1788 /// Our goal is to efficiently aggregate values across all OpenMP
1789 /// threads such that:
1790 ///
1791 /// - the compiler and runtime are logically concise, and
1792 /// - the reduction is performed efficiently in a hierarchical
1793 /// manner as follows: within OpenMP threads in the same warp,
1794 /// across warps in a threadblock, and finally across teams on
1795 /// the NVPTX device.
1796 ///
1797 /// Introduction to Decoupling
1798 ///
1799 /// We would like to decouple the compiler and the runtime so that the
1800 /// latter is ignorant of the reduction variables (number, data types)
1801 /// and the reduction operators. This allows a simpler interface
1802 /// and implementation while still attaining good performance.
1803 ///
1804 /// Pseudocode for the aforementioned OpenMP program generated by the
1805 /// compiler is as follows:
1806 ///
1807 /// 1. Create private copies of reduction variables on each OpenMP
1808 /// thread: 'foo_private', 'bar_private'
1809 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1810 /// to it and writes the result in 'foo_private' and 'bar_private'
1811 /// respectively.
1812 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1813 /// and store the result on the team master:
1814 ///
1815 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1816 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1817 ///
1818 /// where:
1819 /// struct ReduceData {
1820 /// double *foo;
1821 /// double *bar;
1822 /// } reduceData
1823 /// reduceData.foo = &foo_private
1824 /// reduceData.bar = &bar_private
1825 ///
1826 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1827 /// auxiliary functions generated by the compiler that operate on
1828 /// variables of type 'ReduceData'. They help the runtime perform
1829 /// algorithmic steps in a data-agnostic manner.
1830 ///
1831 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1832 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1833 /// same warp. It takes the following arguments as input:
1834 ///
1835 /// a. variable of type 'ReduceData' on the calling lane,
1836 /// b. its lane_id,
1837 /// c. an offset relative to the current lane_id to generate a
1838 /// remote_lane_id. The remote lane contains the second
1839 /// variable of type 'ReduceData' that is to be reduced.
1840 /// d. an algorithm version parameter determining which reduction
1841 /// algorithm to use.
1842 ///
1843 /// 'shuffleReduceFn' retrieves data from the remote lane using
1844 /// efficient GPU shuffle intrinsics and reduces, using the
1845 /// algorithm specified by the 4th parameter, the two operands
1846 /// element-wise. The result is written to the first operand.
1847 ///
1848 /// Different reduction algorithms are implemented in different
1849 /// runtime functions, all calling 'shuffleReduceFn' to perform
1850 /// the essential reduction step. Therefore, based on the 4th
1851 /// parameter, this function behaves slightly differently to
1852 /// cooperate with the runtime to ensure correctness under
1853 /// different circumstances.
1854 ///
1855 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1856 /// reduced variables across warps. It tunnels, through CUDA
1857 /// shared memory, the thread-private data of type 'ReduceData'
1858 /// from lane 0 of each warp to a lane in the first warp.
1859 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1860 /// The last team writes the global reduced value to memory.
1861 ///
1862 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1863 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1864 /// scratchpadCopyFn, loadAndReduceFn)
1865 ///
1866 /// 'scratchpadCopyFn' is a helper that stores reduced
1867 /// data from the team master to a scratchpad array in
1868 /// global memory.
1869 ///
1870 /// 'loadAndReduceFn' is a helper that loads data from
1871 /// the scratchpad array and reduces it with the input
1872 /// operand.
1873 ///
1874 /// These compiler generated functions hide address
1875 /// calculation and alignment information from the runtime.
1876 /// 5. if ret == 1:
1877 /// The team master of the last team stores the reduced
1878 /// result to the globals in memory.
1879 /// foo += reduceData.foo; bar *= reduceData.bar
1880 ///
1881 ///
1882 /// Warp Reduction Algorithms
1883 ///
1884 /// On the warp level, we have three algorithms implemented in the
1885 /// OpenMP runtime depending on the number of active lanes:
1886 ///
1887 /// Full Warp Reduction
1888 ///
1889 /// The reduce algorithm within a warp where all lanes are active
1890 /// is implemented in the runtime as follows:
1891 ///
1892 /// full_warp_reduce(void *reduce_data,
1893 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1894 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1895 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1896 /// }
1897 ///
1898 /// The algorithm completes in log(2, WARPSIZE) steps.
1899 ///
1900 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1901 /// not used; therefore we save instructions by not retrieving lane_id
1902 /// from the corresponding special registers. The 4th parameter, which
1903 /// represents the version of the algorithm being used, is set to 0 to
1904 /// signify full warp reduction.
1905 ///
1906 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1907 ///
1908 /// #reduce_elem refers to an element in the local lane's data structure
1909 /// #remote_elem is retrieved from a remote lane
1910 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1911 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1912 ///
1913 /// Contiguous Partial Warp Reduction
1914 ///
1915 /// This reduce algorithm is used within a warp where only the first
1916 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1917 /// number of OpenMP threads in a parallel region is not a multiple of
1918 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1919 ///
1920 /// void
1921 /// contiguous_partial_reduce(void *reduce_data,
1922 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1923 /// int size, int lane_id) {
1924 /// int curr_size;
1925 /// int offset;
1926 /// curr_size = size;
1927 /// offset = curr_size/2;
1928 /// while (offset>0) {
1929 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1930 /// curr_size = (curr_size+1)/2;
1931 /// offset = curr_size/2;
1932 /// }
1933 /// }
1934 ///
1935 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1936 ///
1937 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1938 /// if (lane_id < offset)
1939 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1940 /// else
1941 /// reduce_elem = remote_elem
1942 ///
1943 /// This algorithm assumes that the data to be reduced are located in a
1944 /// contiguous subset of lanes starting from the first. When there is
1945 /// an odd number of active lanes, the data in the last lane is not
1946 /// aggregated with any other lane's data but is instead copied over.
1947 ///
1948 /// Dispersed Partial Warp Reduction
1949 ///
1950 /// This algorithm is used within a warp when any discontiguous subset of
1951 /// lanes are active. It is used to implement the reduction operation
1952 /// across lanes in an OpenMP simd region or in a nested parallel region.
1953 ///
1954 /// void
1955 /// dispersed_partial_reduce(void *reduce_data,
1956 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1957 /// int size, remote_id;
1958 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1959 /// do {
1960 /// remote_id = next_active_lane_id_right_after_me();
1961 /// # the above function returns 0 if no active lane
1962 /// # is present right after the current lane.
1963 /// size = number_of_active_lanes_in_this_warp();
1964 /// logical_lane_id /= 2;
1965 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1966 /// remote_id-1-threadIdx.x, 2);
1967 /// } while (logical_lane_id % 2 == 0 && size > 1);
1968 /// }
1969 ///
1970 /// There is no assumption made about the initial state of the reduction.
1971 /// Any number of lanes (>=1) could be active at any position. The reduction
1972 /// result is returned in the first active lane.
1973 ///
1974 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1975 ///
1976 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1977 /// if (lane_id % 2 == 0 && offset > 0)
1978 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1979 /// else
1980 /// reduce_elem = remote_elem
1981 ///
1982 ///
1983 /// Intra-Team Reduction
1984 ///
1985 /// This function, as implemented in the runtime call
1986 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1987 /// threads in a team. It first reduces within a warp using the
1988 /// aforementioned algorithms. We then proceed to gather all such
1989 /// reduced values at the first warp.
1990 ///
1991 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1992 /// data from each of the "warp masters" (the zeroth lane of each warp, where
1993 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1994 /// a mathematical sense) the problem of reduction across warp masters in
1995 /// a block to the problem of warp reduction.
1996 ///
1997 ///
1998 /// Inter-Team Reduction
1999 ///
2000 /// Once a team has reduced its data to a single value, it is stored in
2001 /// a global scratchpad array. Since each team has a distinct slot, this
2002 /// can be done without locking.
2003 ///
2004 /// The last team to write to the scratchpad array proceeds to reduce the
2005 /// scratchpad array. One or more workers in the last team use the helper
2006 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2007 /// the k'th worker reduces every k'th element.
2008 ///
2009 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2010 /// reduce across workers and compute a globally reduced value.
2011 ///
2012 /// \param Loc The location where the reduction was
2013 /// encountered. Must be within the associated
2014 /// directive and after the last local access to the
2015 /// reduction variables.
2016 /// \param AllocaIP An insertion point suitable for allocas usable
2017 /// in reductions.
2018 /// \param CodeGenIP An insertion point suitable for code
2019 /// generation.
2020 /// \param ReductionInfos A list of info on each reduction variable.
2021 /// \param IsNoWait Optional flag set if the reduction is marked as
2022 /// nowait.
2023 /// \param IsTeamsReduction Optional flag set if it is a teams
2024 /// reduction.
2025 /// \param GridValue Optional GPU grid value.
2026 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
2027 /// used for teams reduction.
2028 /// \param SrcLocInfo Source location information global.
2029 LLVM_ABI InsertPointOrErrorTy createReductionsGPU(
2030 const LocationDescription &Loc, InsertPointTy AllocaIP,
2031 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
2032 bool IsNoWait = false, bool IsTeamsReduction = false,
2033 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
2034 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
2035 Value *SrcLocInfo = nullptr);
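// Illustrative sketch (not part of the header): a device-side reduction over
// the ReductionInfo RI sketched after the ReductionInfo struct above. Loc,
// AllocaIP and CodeGenIP are assumptions; the MLIR-style callbacks are used
// by default.
//
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//       OMPBuilder.createReductionsGPU(Loc, AllocaIP, CodeGenIP, {RI},
//                                      /*IsNoWait=*/false,
//                                      /*IsTeamsReduction=*/true);
//   if (!AfterIP)
//     return AfterIP.takeError();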
2036
2037 // TODO: provide atomic and non-atomic reduction generators for reduction
2038 // operators defined by the OpenMP specification.
2039
2040 /// Generator for '#omp reduction'.
2041 ///
2042 /// Emits the IR instructing the runtime to perform the specific kind of
2043 /// reductions. Expects reduction variables to have been privatized and
2044 /// initialized to reduction-neutral values separately. Emits the calls to
2045 /// runtime functions as well as the reduction function and the basic blocks
2046 /// performing the reduction atomically and non-atomically.
2047 ///
2048 /// The code emitted for the following:
2049 ///
2050 /// \code
2051 /// type var_1;
2052 /// type var_2;
2053 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
2054 /// /* body */;
2055 /// \endcode
2056 ///
2057 /// corresponds to the following sketch.
2058 ///
2059 /// \code
2060 /// void _outlined_par() {
2061 /// // N is the number of different reductions.
2062 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
2063 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
2064 /// _omp_reduction_func,
2065 /// _gomp_critical_user.reduction.var)) {
2066 /// case 1: {
2067 /// var_1 = var_1 <reduction-op> privatized_var_1;
2068 /// var_2 = var_2 <reduction-op> privatized_var_2;
2069 /// // ...
2070 /// __kmpc_end_reduce(...);
2071 /// break;
2072 /// }
2073 /// case 2: {
2074 /// _Atomic<ReductionOp>(var_1, privatized_var_1);
2075 /// _Atomic<ReductionOp>(var_2, privatized_var_2);
2076 /// // ...
2077 /// break;
2078 /// }
2079 /// default: break;
2080 /// }
2081 /// }
2082 ///
2083 /// void _omp_reduction_func(void **lhs, void **rhs) {
2084 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
2085 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
2086 /// // ...
2087 /// }
2088 /// \endcode
2089 ///
2090 /// \param Loc The location where the reduction was
2091 /// encountered. Must be within the associated
2092 /// directive and after the last local access to the
2093 /// reduction variables.
2094 /// \param AllocaIP An insertion point suitable for allocas usable
2095 /// in reductions.
2096 /// \param ReductionInfos A list of info on each reduction variable.
2097 /// \param IsNoWait A flag set if the reduction is marked as nowait.
2098 /// \param IsByRef Per-reduction flags indicating whether the reduction
2099 /// uses a reference or a direct value.
2100 /// \param IsTeamsReduction Optional flag set if it is a teams
2101 /// reduction.
2102 LLVM_ABI InsertPointOrErrorTy createReductions(
2103 const LocationDescription &Loc, InsertPointTy AllocaIP,
2104 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
2105 bool IsNoWait = false, bool IsTeamsReduction = false);
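// Illustrative sketch (not part of the header): the host-side counterpart for
// a single by-value reduction, reusing the ReductionInfo RI from the earlier
// sketch. Loc and AllocaIP are assumptions.
//
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//       OMPBuilder.createReductions(Loc, AllocaIP, {RI}, /*IsByRef=*/{false},
//                                   /*IsNoWait=*/false);
//   if (!AfterIP)
//     return AfterIP.takeError();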
2106
2107 ///}
2108
2109 /// Return the insertion point used by the underlying IRBuilder.
2110 InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
2111
2112 /// Update the internal location to \p Loc.
2113 bool updateToLocation(const LocationDescription &Loc) {
2114 Builder.restoreIP(Loc.IP);
2115 Builder.SetCurrentDebugLocation(Loc.DL);
2116 return Loc.IP.getBlock() != nullptr;
2117 }
2118
2119 /// Return the function declaration for the runtime function with \p FnID.
2120 LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M,
2121 omp::RuntimeFunction FnID);
2122
2123 LLVM_ABI Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);
2124
2125 /// Return the (LLVM-IR) string describing the source location \p LocStr.
2126 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef LocStr,
2127 uint32_t &SrcLocStrSize);
2128
2129 /// Return the (LLVM-IR) string describing the default source location.
2130 LLVM_ABI Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);
2131
2132 /// Return the (LLVM-IR) string describing the source location identified by
2133 /// the arguments.
2134 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef FunctionName,
2135 StringRef FileName, unsigned Line,
2136 unsigned Column,
2137 uint32_t &SrcLocStrSize);
2138
2139 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
2140 /// fallback if \p DL does not specify the function name.
2141 LLVM_ABI Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
2142 Function *F = nullptr);
2143
2144 /// Return the (LLVM-IR) string describing the source location \p Loc.
2145 LLVM_ABI Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
2146 uint32_t &SrcLocStrSize);
2147
2148 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
2149 /// TODO: Create an enum class for the Reserve2Flags
2150 LLVM_ABI Constant *getOrCreateIdent(Constant *SrcLocStr,
2151 uint32_t SrcLocStrSize,
2152 omp::IdentFlag Flags = omp::IdentFlag(0),
2153 unsigned Reserve2Flags = 0);
2154
2155 /// Create a hidden global flag \p Name in the module with initial value \p
2156 /// Value.
2157 LLVM_ABI GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
2158
2159 /// Emit the llvm.used metadata.
2160 LLVM_ABI void emitUsed(StringRef Name, ArrayRef<llvm::WeakTrackingVH> List);
2161
2162 /// Emit the kernel execution mode.
2163 LLVM_ABI GlobalVariable *
2164 emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode);
2165
2166 /// Generate control flow and cleanup for cancellation.
2167 ///
2168 /// \param CancelFlag Flag indicating if the cancellation is performed.
2169 /// \param CanceledDirective The kind of directive that is canceled.
2170 /// \param ExitCB Extra code to be generated in the exit block.
2171 ///
2172 /// \return an error, if any were triggered during execution.
2173 LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag,
2174 omp::Directive CanceledDirective,
2175 FinalizeCallbackTy ExitCB = {});
2176
2177 /// Generate a target region entry call.
2178 ///
2179 /// \param Loc The location at which the request originated and is fulfilled.
2180 /// \param AllocaIP The insertion point to be used for alloca instructions.
2181 /// \param Return Return value of the created function returned by reference.
2182 /// \param DeviceID Identifier for the device via the 'device' clause.
2183 /// \param NumTeams Number of teams for the region via the 'num_teams' clause
2184 /// or 0 if unspecified and -1 if there is no 'teams' clause.
2185 /// \param NumThreads Number of threads via the 'thread_limit' clause.
2186 /// \param HostPtr Pointer to the host-side pointer of the target kernel.
2187 /// \param KernelArgs Array of arguments to the kernel.
2188 LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc,
2189 InsertPointTy AllocaIP,
2190 Value *&Return, Value *Ident,
2191 Value *DeviceID, Value *NumTeams,
2192 Value *NumThreads, Value *HostPtr,
2193 ArrayRef<Value *> KernelArgs);
2194
2195 /// Generate a flush runtime call.
2196 ///
2197 /// \param Loc The location at which the request originated and is fulfilled.
2198 LLVM_ABI void emitFlush(const LocationDescription &Loc);
2199
2200 /// The finalization stack made up of finalize callbacks currently in-flight,
2201 /// wrapped into FinalizationInfo objects that reference also the finalization
2202 /// target block and the kind of cancellable directive.
2203 SmallVector<FinalizationInfo, 8> FinalizationStack;
2204
2205 /// Return true if the last entry in the finalization stack is of kind \p DK
2206 /// and cancellable.
2207 bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2208 return !FinalizationStack.empty() &&
2209 FinalizationStack.back().IsCancellable &&
2210 FinalizationStack.back().DK == DK;
2211 }
2212
2213 /// Generate a taskwait runtime call.
2214 ///
2215 /// \param Loc The location at which the request originated and is fulfilled.
2216 LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc);
2217
2218 /// Generate a taskyield runtime call.
2219 ///
2220 /// \param Loc The location at which the request originated and is fulfilled.
2221 LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc);
2222
2223 /// Return the current thread ID.
2224 ///
2225 /// \param Ident The ident (ident_t*) describing the query origin.
2226 LLVM_ABI Value *getOrCreateThreadID(Value *Ident);
2227
2228 /// The OpenMPIRBuilder Configuration
2229 OpenMPIRBuilderConfig Config;
2230
2231 /// The underlying LLVM-IR module
2232 Module &M;
2233
2234 /// The LLVM-IR Builder used to create IR.
2235 IRBuilder<> Builder;
2236
2237 /// Map to remember source location strings
2238 StringMap<Constant *> SrcLocStrMap;
2239
2240 /// Map to remember existing ident_t*.
2241 DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;
2242
2243 /// Info manager to keep track of target regions.
2244 OffloadEntriesInfoManager OffloadInfoManager;
2245
2246 /// The target triple of the underlying module.
2247 const Triple T;
2248
2249 /// Helper that contains information about regions we need to outline
2250 /// during finalization.
2251 struct OutlineInfo {
2252 using PostOutlineCBTy = std::function<void(Function &)>;
2253 PostOutlineCBTy PostOutlineCB;
2254 BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
2255 SmallVector<Value *, 2> ExcludeArgsFromAggregate;
2256
2257 /// Collect all blocks in between EntryBB and ExitBB in both the given
2258 /// vector and set.
2259 LLVM_ABI void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
2260 SmallVectorImpl<BasicBlock *> &BlockVector);
2261
2262 /// Return the function that contains the region to be outlined.
2263 Function *getFunction() const { return EntryBB->getParent(); }
2264 };
2265
2266 /// Collection of regions that need to be outlined during finalization.
2267 SmallVector<OutlineInfo, 16> OutlineInfos;
2268
2269 /// A collection of candidate target functions whose constant allocas will
2270 /// be raised on a call to finalize, after all currently enqueued
2271 /// outline infos have been processed.
2272 SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;
2273
2274 /// Collection of owned canonical loop objects that eventually need to be
2275 /// free'd.
2276 std::forward_list<CanonicalLoopInfo> LoopInfos;
2277
2278 /// Collection of owned ScanInfo objects that eventually need to be free'd.
2279 std::forward_list<ScanInfo> ScanInfos;
2280
2281 /// Add a new region that will be outlined later.
2282 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2283
2284 /// An ordered map of auto-generated variables to their unique names.
2285 /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2286 /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2287 /// <mangled_name_for_global_var> + ".cache." for the cache of threadprivate
2288 /// variables.
2289 StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;
2290
2291 /// Computes the size of type in bytes.
2293
2294 // Emit a branch from the current block to the Target block only if
2295 // the current block has a terminator.
2296 LLVM_ABI void emitBranch(BasicBlock *Target);
2297
2298 // If BB has no use then delete it and return. Else place BB after the current
2299 // block, if possible, or else at the end of the function. Also add a branch
2300 // from current block to BB if current block does not have a terminator.
2301 LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn,
2302 bool IsFinished = false);
2303
2304 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
2305 /// Here is the logic:
2306 /// if (Cond) {
2307 /// ThenGen();
2308 /// } else {
2309 /// ElseGen();
2310 /// }
2311 ///
2312 /// \return an error, if any were triggered during execution.
2313 LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
2314 BodyGenCallbackTy ElseGen,
2315 InsertPointTy AllocaIP = {});
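// Illustrative sketch (not part of the header): emitting both branches of an
// OpenMP 'if' clause. Cond and the callback bodies are assumptions, and
// BodyGenCallbackTy is assumed to take (AllocaIP, CodeGenIP) and return Error.
//
//   auto ThenGen = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                      OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
//     Builder.restoreIP(CodeGenIP);
//     // ... emit the parallel/offloaded version here ...
//     return Error::success();
//   };
//   auto ElseGen = ThenGen; // in practice, a callback emitting the serial path
//   if (Error Err = OMPBuilder.emitIfClause(Cond, ThenGen, ElseGen, AllocaIP))
//     return Err;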
2316
2317 /// Create the global variable holding the offload mappings information.
2318 LLVM_ABI GlobalVariable *
2319 createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
2320 std::string VarName);
2321
2322 /// Create the global variable holding the offload names information.
2323 LLVM_ABI GlobalVariable *
2324 createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
2325 std::string VarName);
2326
2327 struct MapperAllocas {
2328 AllocaInst *ArgsBase = nullptr;
2329 AllocaInst *Args = nullptr;
2330 AllocaInst *ArgSizes = nullptr;
2331 };
2332
2333 /// Create the allocas instruction used in call to mapper functions.
2334 LLVM_ABI void createMapperAllocas(const LocationDescription &Loc,
2335 InsertPointTy AllocaIP,
2336 unsigned NumOperands,
2337 struct MapperAllocas &MapperAllocas);
2338
2339 /// Create the call for the target mapper function.
2340 /// \param Loc The source location description.
2341 /// \param MapperFunc Function to be called.
2342 /// \param SrcLocInfo Source location information global.
2343 /// \param MaptypesArg The argument types.
2344 /// \param MapnamesArg The argument names.
2345 /// \param MapperAllocas The AllocaInst used for the call.
2346 /// \param DeviceID Device ID for the call.
2347 /// \param NumOperands Number of operands in the call.
2348 LLVM_ABI void emitMapperCall(const LocationDescription &Loc,
2349 Function *MapperFunc, Value *SrcLocInfo,
2350 Value *MaptypesArg, Value *MapnamesArg,
2351 struct MapperAllocas &MapperAllocas,
2352 int64_t DeviceID, unsigned NumOperands);
2353
2354 /// Container for the arguments used to pass data to the runtime library.
2355 struct TargetDataRTArgs {
2356 /// The array of base pointer passed to the runtime library.
2357 Value *BasePointersArray = nullptr;
2358 /// The array of section pointers passed to the runtime library.
2359 Value *PointersArray = nullptr;
2360 /// The array of sizes passed to the runtime library.
2361 Value *SizesArray = nullptr;
2362 /// The array of map types passed to the runtime library for the beginning
2363 /// of the region or for the entire region if there are no separate map
2364 /// types for the region end.
2365 Value *MapTypesArray = nullptr;
2366 /// The array of map types passed to the runtime library for the end of the
2367 /// region, or nullptr if there are no separate map types for the region
2368 /// end.
2369 Value *MapTypesArrayEnd = nullptr;
2370 /// The array of user-defined mappers passed to the runtime library.
2371 Value *MappersArray = nullptr;
2372 /// The array of original declaration names of mapped pointers sent to the
2373 /// runtime library for debugging
2374 Value *MapNamesArray = nullptr;
2375
2376 explicit TargetDataRTArgs() {}
2377 explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
2378 Value *SizesArray, Value *MapTypesArray,
2379 Value *MapTypesArrayEnd, Value *MappersArray,
2380 Value *MapNamesArray)
2381 : BasePointersArray(BasePointersArray), PointersArray(PointersArray),
2382 SizesArray(SizesArray), MapTypesArray(MapTypesArray),
2383 MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
2384 MapNamesArray(MapNamesArray) {}
2385 };
2386
2387 /// Container to pass the default attributes with which a kernel must be
2388 /// launched, used to set kernel attributes and populate associated static
2389 /// structures.
2390 ///
2391 /// For max values, < 0 means unset, == 0 means set but unknown at compile
2392 /// time. The number of max values will be 1 except for the case where
2393 /// ompx_bare is set.
2394 struct TargetKernelDefaultAttrs {
2395 omp::OMPTgtExecModeFlags ExecFlags =
2396 omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2397 SmallVector<int32_t, 3> MaxTeams = {-1};
2398 int32_t MinTeams = 1;
2399 SmallVector<int32_t, 3> MaxThreads = {-1};
2400 int32_t MinThreads = 1;
2401 int32_t ReductionDataSize = 0;
2402 int32_t ReductionBufferLength = 0;
2403 };
2404
2405 /// Container to pass LLVM IR runtime values or constants related to the
2406 /// number of teams and threads with which the kernel must be launched, as
2407 /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
2408 /// kernel. These must be defined in the host prior to the call to the kernel
2409 /// launch OpenMP RTL function.
2410 struct TargetKernelRuntimeAttrs {
2411 SmallVector<Value *, 3> MaxTeams = {nullptr};
2412 Value *MinTeams = nullptr;
2413 SmallVector<Value *, 3> TargetThreadLimit = {nullptr};
2414 SmallVector<Value *, 3> TeamsThreadLimit = {nullptr};
2415
2416 /// 'parallel' construct 'num_threads' clause value, if present and it is an
2417 /// SPMD kernel.
2418 Value *MaxThreads = nullptr;
2419
2420 /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if
2421 /// it is a generic kernel.
2422 Value *LoopTripCount = nullptr;
2423 };
2424
2425 /// Data structure that contains the needed information to construct the
2426 /// kernel args vector.
2427 struct TargetKernelArgs {
2428 /// Number of arguments passed to the runtime library.
2429 unsigned NumTargetItems = 0;
2430 /// Arguments passed to the runtime library
2431 TargetDataRTArgs RTArgs;
2432 /// The number of iterations
2433 Value *NumIterations = nullptr;
2434 /// The number of teams.
2435 ArrayRef<Value *> NumTeams;
2436 /// The number of threads.
2437 ArrayRef<Value *> NumThreads;
2438 /// The size of the dynamic shared memory.
2439 Value *DynCGGroupMem = nullptr;
2440 /// True if the kernel has 'no wait' clause.
2441 bool HasNoWait = false;
2442
2443 // Constructors for TargetKernelArgs.
2444 TargetKernelArgs() {}
2445 TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
2446 Value *NumIterations, ArrayRef<Value *> NumTeams,
2447 ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
2448 bool HasNoWait)
2449 : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
2450 NumIterations(NumIterations), NumTeams(NumTeams),
2451 NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
2452 HasNoWait(HasNoWait) {}
2453 };
2454
2455 /// Create the kernel args vector used by emitTargetKernel. This function
2456 /// creates various constant values that are used in the resulting args
2457 /// vector.
2458 LLVM_ABI static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2459 IRBuilderBase &Builder,
2460 SmallVector<Value *> &ArgsVector);
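// Illustrative sketch (not part of the header): packing kernel arguments
// before an emitTargetKernel call. NumItems, RTArgs, TripCount, NumTeamsVals
// and NumThreadsVals are assumed to be populated by the caller from the
// target construct's clauses.
//
//   OpenMPIRBuilder::TargetKernelArgs KArgs(
//       NumItems, RTArgs, TripCount, NumTeamsVals, NumThreadsVals,
//       /*DynCGGroupMem=*/Builder.getInt32(0), /*HasNoWait=*/false);
//   SmallVector<Value *> ArgsVector;
//   OpenMPIRBuilder::getKernelArgsVector(KArgs, Builder, ArgsVector);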
2461
2462 /// Struct that keeps the information that should be kept throughout
2463 /// a 'target data' region.
2464 class TargetDataInfo {
2465 /// Set to true if device pointer information has to be obtained.
2466 bool RequiresDevicePointerInfo = false;
2467 /// Set to true if Clang emits separate runtime calls for the beginning and
2468 /// end of the region. These calls might have separate map type arrays.
2469 bool SeparateBeginEndCalls = false;
2470
2471 public:
2472 TargetDataRTArgs RTArgs;
2473
2474 SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
2475 DevicePtrInfoMap;
2476
2477 /// Indicate whether any user-defined mapper exists.
2478 bool HasMapper = false;
2479 /// The total number of pointers passed to the runtime library.
2480 unsigned NumberOfPtrs = 0u;
2481
2482 bool EmitDebug = false;
2483
2484 /// Whether the `target ... data` directive has a `nowait` clause.
2485 bool HasNoWait = false;
2486
2487 explicit TargetDataInfo() {}
2488 explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2489 bool SeparateBeginEndCalls)
2490 : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2491 SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2492 /// Clear information about the data arrays.
2493 void clearArrayInfo() {
2494 RTArgs = TargetDataRTArgs();
2495 HasMapper = false;
2496 NumberOfPtrs = 0u;
2497 }
2498 /// Return true if the current target data information has valid arrays.
2499 bool isValid() {
2500 return RTArgs.BasePointersArray && RTArgs.PointersArray &&
2501 RTArgs.SizesArray && RTArgs.MapTypesArray &&
2502 (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
2503 }
2504 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
2505 bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2506 };
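/// A minimal usage sketch (illustrative only; the arrays are normally
/// populated by emitOffloadingArrays):
/// \code{.cpp}
///   OpenMPIRBuilder::TargetDataInfo Info(/*RequiresDevicePointerInfo=*/true,
///                                        /*SeparateBeginEndCalls=*/true);
///   // ... emitOffloadingArrays(...) fills Info.RTArgs and Info.NumberOfPtrs ...
///   bool Ready = Info.isValid(); // true once all required arrays are set
///   Info.clearArrayInfo();       // reset before emitting another region
/// \endcode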
2507
2508 enum class DeviceInfoTy { None, Pointer, Address };
2509 using MapValuesArrayTy = SmallVector<Value *, 4>;
2510 using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
2511 using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
2512 using MapNamesArrayTy = SmallVector<Constant *, 4>;
2513 using MapDimArrayTy = SmallVector<uint64_t, 4>;
2514 using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;
2515
2516 /// This structure contains combined information generated for mappable
2517 /// clauses, including base pointers, pointers, sizes, map types, user-defined
2518 /// mappers, and non-contiguous information.
2519 struct MapInfosTy {
2520 struct StructNonContiguousInfo {
2521 bool IsNonContiguous = false;
2522 MapDimArrayTy Dims;
2523 MapNonContiguousArrayTy Offsets;
2524 MapNonContiguousArrayTy Counts;
2525 MapNonContiguousArrayTy Strides;
2526 };
2527 MapValuesArrayTy BasePointers;
2528 MapValuesArrayTy Pointers;
2529 MapDeviceInfoArrayTy DevicePointers;
2530 MapValuesArrayTy Sizes;
2531 MapFlagsArrayTy Types;
2532 MapNamesArrayTy Names;
2533 StructNonContiguousInfo NonContigInfo;
2534
2535 /// Append arrays in \a CurInfo.
2536 void append(MapInfosTy &CurInfo) {
2537 BasePointers.append(CurInfo.BasePointers.begin(),
2538 CurInfo.BasePointers.end());
2539 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2540 DevicePointers.append(CurInfo.DevicePointers.begin(),
2541 CurInfo.DevicePointers.end());
2542 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2543 Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2544 Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2545 NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
2546 CurInfo.NonContigInfo.Dims.end());
2547 NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
2548 CurInfo.NonContigInfo.Offsets.end());
2549 NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
2550 CurInfo.NonContigInfo.Counts.end());
2551 NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
2552 CurInfo.NonContigInfo.Strides.end());
2553 }
2554 };
2555 using MapInfosOrErrorTy = Expected<MapInfosTy &>;
2556
2557 /// Callback function type for functions emitting the host fallback code that
2558 /// is executed when the kernel launch fails. It takes an insertion point as
2559 /// parameter where the code should be emitted. It returns an insertion point
2560 /// that points right after the emitted code.
2561 using EmitFallbackCallbackTy =
2562 function_ref<InsertPointOrErrorTy(InsertPointTy)>;
2563
2564 // Callback function type for emitting and fetching user defined custom
2565 // mappers.
2566 using CustomMapperCallbackTy =
2567 function_ref<Expected<Function *>(unsigned int)>;
2568
2569 /// Generate a target region entry call and host fallback call.
2570 ///
2571 /// \param Loc The location at which the request originated and is fulfilled.
2572 /// \param OutlinedFnID The outlined function ID.
2573 /// \param EmitTargetCallFallbackCB Call back function to generate host
2574 /// fallback code.
2575 /// \param Args Data structure holding information about the kernel arguments.
2576 /// \param DeviceID Identifier for the device via the 'device' clause.
2577 /// \param RTLoc Source location identifier
2578 /// \param AllocaIP The insertion point to be used for alloca instructions.
2579 LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(
2580 const LocationDescription &Loc, Value *OutlinedFnID,
2581 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
2582 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
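/// A minimal sketch of a host-fallback callback and the launch call. The
/// builder instances, `HostFallbackFn`, `FallbackArgs`, and the remaining
/// arguments are illustrative assumptions.
/// \code{.cpp}
///   auto FallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
///       -> OpenMPIRBuilder::InsertPointOrErrorTy {
///     Builder.restoreIP(IP);
///     // Run the outlined host version when the device launch fails.
///     Builder.CreateCall(HostFallbackFn, FallbackArgs);
///     return Builder.saveIP();
///   };
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.emitKernelLaunch(
///       Loc, OutlinedFnID, FallbackCB, KernelArgs, DeviceID, RTLoc, AllocaIP);
///   if (!AfterIP)
///     return AfterIP.takeError();
///   Builder.restoreIP(*AfterIP);
/// \endcode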
2583
2584 /// Callback type for generating the bodies of device directives that require
2585 /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses).
2586 ///
2587 /// \param DeviceID The ID of the device on which the target region will
2588 /// execute.
2589 /// \param RTLoc Source location identifier
2590 /// \param TargetTaskAllocaIP Insertion point for the alloca block of the
2591 /// generated task.
2592 ///
2593 /// \return an error, if any were triggered during execution.
2594 using TargetTaskBodyCallbackTy =
2595 function_ref<Error(Value *DeviceID, Value *RTLoc,
2596 IRBuilderBase::InsertPoint TargetTaskAllocaIP)>;
2597
2598 /// Generate a target-task for the target construct
2599 ///
2600 /// \param TaskBodyCB Callback to generate the actual body of the target task.
2601 /// \param DeviceID Identifier for the device via the 'device' clause.
2602 /// \param RTLoc Source location identifier
2603 /// \param AllocaIP The insertion point to be used for alloca instructions.
2604 /// \param Dependencies Vector of DependData objects holding information of
2605 /// dependencies as specified by the 'depend' clause.
2606 /// \param HasNoWait True if the target construct had 'nowait' on it, false
2607 /// otherwise
2608 LLVM_ABI InsertPointOrErrorTy emitTargetTask(
2609 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
2610 OpenMPIRBuilder::InsertPointTy AllocaIP,
2611 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
2612 const TargetDataRTArgs &RTArgs, bool HasNoWait);
2613
2614 /// Emit the arguments to be passed to the runtime library based on the
2615 /// arrays of base pointers, pointers, sizes, map types, and mappers. If
2616 /// ForEndCall, emit map types to be passed for the end of the region instead
2617 /// of the beginning.
2618 LLVM_ABI void emitOffloadingArraysArgument(
2619 IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
2620 OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall = false);
2621
2622 /// Emit an array of struct descriptors to be assigned to the offload args.
2623 LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
2624 InsertPointTy CodeGenIP,
2625 MapInfosTy &CombinedInfo,
2626 TargetDataInfo &Info);
2627
2628 /// Emit the arrays used to pass the captures and map information to the
2629 /// offloading runtime library. If there is no map or capture information,
2630 /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2631 /// that contains information generated for mappable clauses,
2632 /// including base pointers, pointers, sizes, map types, user-defined mappers.
2633 LLVM_ABI Error emitOffloadingArrays(
2634 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2635 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
2636 bool IsNonContiguous = false,
2637 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2638
2639 /// Allocates memory for and populates the arrays required for offloading
2640 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2641 /// emits their base addresses as arguments to be passed to the runtime
2642 /// library. In essence, this function is a combination of
2643 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2644 /// be preferred by clients of OpenMPIRBuilder.
2645 LLVM_ABI Error emitOffloadingArraysAndArgs(
2646 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2647 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2648 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous = false,
2649 bool ForEndCall = false,
2650 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2651
2652 /// Creates offloading entry for the provided entry ID \a ID, address \a
2653 /// Addr, size \a Size, and flags \a Flags.
2654 LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
2655 int32_t Flags, GlobalValue::LinkageTypes,
2656 StringRef Name = "");
2657
2658 /// The kind of errors that can occur when emitting the offload entries and
2659 /// metadata.
2660 enum EmitMetadataErrorKind {
2661 EMIT_MD_TARGET_REGION_ERROR,
2662 EMIT_MD_DECLARE_TARGET_ERROR,
2663 EMIT_MD_GLOBAL_VAR_LINK_ERROR
2664 };
2665
2666 /// Callback function type
2667 using EmitMetadataErrorReportFunctionTy =
2668 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2669
2670 // Emit the offloading entries and metadata so that the device codegen side
2671 // can easily figure out what to emit. The produced metadata looks like
2672 // this:
2673 //
2674 // !omp_offload.info = !{!1, ...}
2675 //
2676 // We only generate metadata for functions that contain target regions.
2677 LLVM_ABI void createOffloadEntriesAndInfoMetadata(
2678 EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2679
2680public:
2681 /// Generator for __kmpc_copyprivate
2682 ///
2683 /// \param Loc The source location description.
2684 /// \param BufSize Number of elements in the buffer.
2685 /// \param CpyBuf List of pointers to data to be copied.
2686 /// \param CpyFn function to call for copying data.
2687 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2688 ///
2689 /// \return The insertion position *after* the CopyPrivate call.
2690
2691 LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc,
2692 llvm::Value *BufSize,
2693 llvm::Value *CpyBuf,
2694 llvm::Value *CpyFn,
2695 llvm::Value *DidIt);
2696
2697 /// Generator for '#omp single'
2698 ///
2699 /// \param Loc The source location description.
2700 /// \param BodyGenCB Callback that will generate the region code.
2701 /// \param FiniCB Callback to finalize variable copies.
2702 /// \param IsNowait If false, a barrier is emitted.
2703 /// \param CPVars copyprivate variables.
2704 /// \param CPFuncs copy functions to use for each copyprivate variable.
2705 ///
2706 /// \returns The insertion position *after* the single call.
2707 LLVM_ABI InsertPointOrErrorTy
2708 createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2709 FinalizeCallbackTy FiniCB, bool IsNowait,
2710 ArrayRef<llvm::Value *> CPVars = {},
2711 ArrayRef<llvm::Function *> CPFuncs = {});
2712
2713 /// Generator for '#omp master'
2714 ///
2715 /// \param Loc The insert and source location description.
2716 /// \param BodyGenCB Callback that will generate the region code.
2717 /// \param FiniCB Callback to finalize variable copies.
2718 ///
2719 /// \returns The insertion position *after* the master.
2720 LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc,
2721 BodyGenCallbackTy BodyGenCB,
2722 FinalizeCallbackTy FiniCB);
2723
2724 /// Generator for '#omp masked'
2725 ///
2726 /// \param Loc The insert and source location description.
2727 /// \param BodyGenCB Callback that will generate the region code.
2728 /// \param FiniCB Callback to finalize variable copies.
2729 ///
2730 /// \returns The insertion position *after* the masked.
2731 LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc,
2732 BodyGenCallbackTy BodyGenCB,
2733 FinalizeCallbackTy FiniCB,
2734 Value *Filter);
2735
2736 /// This function performs the scan reduction of the values updated in
2737 /// the input phase. The reduction logic needs to be emitted between input
2738 /// and scan loop returned by `CreateCanonicalScanLoops`. The following
2739 /// is the code that is generated, `buffer` and `span` are expected to be
2740 /// populated before executing the generated code.
2741 /// \code{c}
2742 /// for (int k = 0; k != ceil(log2(span)); ++k) {
2743 /// i = pow(2, k);
2744 /// for (size_t cnt = last_iter; cnt >= i; --cnt)
2745 /// buffer[cnt] op= buffer[cnt-i];
2746 /// }
2747 /// \endcode
2748 /// \param Loc The insert and source location description.
2749 /// \param ReductionInfos Array type containing the ReductionOps.
2750 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2751 /// `ScanInfoInitialize`.
2752 ///
2753 /// \returns The insertion position *after* the scan reduction.
2754 LLVM_ABI InsertPointOrErrorTy emitScanReduction(
2755 const LocationDescription &Loc,
2756 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
2757 ScanInfo *ScanRedInfo);
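/// For illustration, the same reduction written directly in C++ for a '+'
/// reduction (an in-place inclusive prefix sum). The function name and element
/// type are assumptions made for this sketch; `buffer` holds the per-iteration
/// values produced by the input phase and `span` is assumed to be >= 1.
/// \code{.cpp}
///   #include <cmath>
///   #include <cstddef>
///   void scanReduce(double *buffer, std::size_t span) {
///     std::size_t last_iter = span - 1;
///     std::size_t rounds = (std::size_t)std::ceil(std::log2((double)span));
///     for (std::size_t k = 0; k != rounds; ++k) {
///       std::size_t i = (std::size_t)1 << k; // pow(2, k)
///       for (std::size_t cnt = last_iter; cnt >= i; --cnt)
///         buffer[cnt] += buffer[cnt - i];    // 'op=' instantiated as '+='
///     }
///   }
/// \endcode
/// After the loops, buffer[j] holds the reduction of elements 0..j, which is
/// what the scan phase reads back.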
2758
2759 /// This directive splits and directs the control flow to input phase
2760 /// blocks or scan phase blocks based on 1. whether input loop or scan loop
2761 /// is executed, 2. whether exclusive or inclusive scan is used.
2762 ///
2763 /// \param Loc The insert and source location description.
2764 /// \param AllocaIP The IP where the temporary buffer for scan reduction
2765 /// needs to be allocated.
2766 /// \param ScanVars Scan Variables.
2767 /// \param IsInclusive Whether it is an inclusive or exclusive scan.
2768 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2769 /// `ScanInfoInitialize`.
2770 ///
2771 /// \returns The insertion position *after* the scan.
2772 LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc,
2773 InsertPointTy AllocaIP,
2774 ArrayRef<llvm::Value *> ScanVars,
2775 ArrayRef<llvm::Type *> ScanVarsType,
2776 bool IsInclusive,
2777 ScanInfo *ScanRedInfo);
2778
2779 /// Generator for '#omp critical'
2780 ///
2781 /// \param Loc The insert and source location description.
2782 /// \param BodyGenCB Callback that will generate the region body code.
2783 /// \param FiniCB Callback to finalize variable copies.
2784 /// \param CriticalName name of the lock used by the critical directive
2785 /// \param HintInst Hint Instruction for hint clause associated with critical
2786 ///
2787 /// \returns The insertion position *after* the critical.
2788 LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc,
2789 BodyGenCallbackTy BodyGenCB,
2790 FinalizeCallbackTy FiniCB,
2791 StringRef CriticalName,
2792 Value *HintInst);
2793
2794 /// Generator for '#omp ordered depend (source | sink)'
2795 ///
2796 /// \param Loc The insert and source location description.
2797 /// \param AllocaIP The insertion point to be used for alloca instructions.
2798 /// \param NumLoops The number of loops in depend clause.
2799 /// \param StoreValues The values to be stored in the vector address.
2800 /// \param Name The name of alloca instruction.
2801 /// \param IsDependSource If true, depend source; otherwise, depend sink.
2802 ///
2803 /// \return The insertion position *after* the ordered.
2804 LLVM_ABI InsertPointTy
2805 createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP,
2806 unsigned NumLoops, ArrayRef<llvm::Value *> StoreValues,
2807 const Twine &Name, bool IsDependSource);
2808
2809 /// Generator for '#omp ordered [threads | simd]'
2810 ///
2811 /// \param Loc The insert and source location description.
2812 /// \param BodyGenCB Callback that will generate the region code.
2813 /// \param FiniCB Callback to finalize variable copies.
2814 /// \param IsThreads If true, the construct has the 'threads' clause or no
2815 /// clause; otherwise, it has the 'simd' clause.
2816 ///
2817 /// \returns The insertion position *after* the ordered.
2818 LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(
2819 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2820 FinalizeCallbackTy FiniCB, bool IsThreads);
2821
2822 /// Generator for '#omp sections'
2823 ///
2824 /// \param Loc The insert and source location description.
2825 /// \param AllocaIP The insertion points to be used for alloca instructions.
2826 /// \param SectionCBs Callbacks that will generate body of each section.
2827 /// \param PrivCB Callback to copy a given variable (think copy constructor).
2828 /// \param FiniCB Callback to finalize variable copies.
2829 /// \param IsCancellable Flag to indicate a cancellable parallel region.
2830 /// \param IsNowait If true, the barrier that ensures all sections are
2831 /// executed before moving forward is not generated.
2832 /// \returns The insertion position *after* the sections.
2833 LLVM_ABI InsertPointOrErrorTy
2834 createSections(const LocationDescription &Loc, InsertPointTy AllocaIP,
2835 ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
2836 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB,
2837 bool IsCancellable, bool IsNowait);
2838
2839 /// Generator for '#omp section'
2840 ///
2841 /// \param Loc The insert and source location description.
2842 /// \param BodyGenCB Callback that will generate the region body code.
2843 /// \param FiniCB Callback to finalize variable copies.
2844 /// \returns The insertion position *after* the section.
2845 LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc,
2846 BodyGenCallbackTy BodyGenCB,
2847 FinalizeCallbackTy FiniCB);
2848
2849 /// Generator for `#omp teams`
2850 ///
2851 /// \param Loc The location where the teams construct was encountered.
2852 /// \param BodyGenCB Callback that will generate the region code.
2853 /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
2854 /// it is as if the lower bound is specified as equal to the upper bound.
2855 /// If this is non-null, then the upper bound must also be non-null.
2856 /// \param NumTeamsUpper Upper bound on the number of teams.
2857 /// \param ThreadLimit Limit on the number of threads that may participate in a
2858 /// contention group created by each team.
2859 /// \param IfExpr is the integer argument value of the if condition on the
2860 /// teams clause.
2861 LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc,
2862 BodyGenCallbackTy BodyGenCB,
2863 Value *NumTeamsLower = nullptr,
2864 Value *NumTeamsUpper = nullptr,
2865 Value *ThreadLimit = nullptr,
2866 Value *IfExpr = nullptr);
2867
2868 /// Generator for `#omp distribute`
2869 ///
2870 /// \param Loc The location where the distribute construct was encountered.
2871 /// \param AllocaIP The insertion points to be used for alloca instructions.
2872 /// \param BodyGenCB Callback that will generate the region code.
2873 LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc,
2874 InsertPointTy AllocaIP,
2875 BodyGenCallbackTy BodyGenCB);
2876
2877 /// Generate conditional branch and relevant BasicBlocks through which private
2878 /// threads copy the 'copyin' variables from Master copy to threadprivate
2879 /// copies.
2880 ///
2881 /// \param IP insertion block for copyin conditional
2882 /// \param MasterAddr a pointer to the master variable
2883 /// \param PrivateAddr a pointer to the threadprivate variable
2884 /// \param IntPtrTy Pointer size type
2885 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
2886 /// and copy.in.end block
2887 ///
2888 /// \returns The insertion point where the copying operation is to be emitted.
2889 LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP,
2890 Value *MasterAddr,
2891 Value *PrivateAddr,
2892 llvm::IntegerType *IntPtrTy,
2893 bool BranchtoEnd = true);
2894
2895 /// Create a runtime call for kmpc_Alloc
2896 ///
2897 /// \param Loc The insert and source location description.
2898 /// \param Size Size of allocated memory space
2899 /// \param Allocator Allocator information instruction
2900 /// \param Name Name of call Instruction for OMP_alloc
2901 ///
2902 /// \returns CallInst to the OMP_Alloc call
2903 LLVM_ABI CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
2904 Value *Allocator, std::string Name = "");
2905
2906 /// Create a runtime call for kmpc_free
2907 ///
2908 /// \param Loc The insert and source location description.
2909 /// \param Addr Address of memory space to be freed
2910 /// \param Allocator Allocator information instruction
2911 /// \param Name Name of call Instruction for OMP_Free
2912 ///
2913 /// \returns CallInst to the OMP_Free call
2914 LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
2915 Value *Allocator, std::string Name = "");
2916
2917 /// Create a runtime call for kmpc_threadprivate_cached
2918 ///
2919 /// \param Loc The insert and source location description.
2920 /// \param Pointer pointer to data to be cached
2921 /// \param Size size of data to be cached
2922 /// \param Name Name of call Instruction for callinst
2923 ///
2924 /// \returns CallInst to the thread private cache call.
2925 LLVM_ABI CallInst *
2926 createCachedThreadPrivate(const LocationDescription &Loc,
2927 llvm::Value *Pointer, llvm::ConstantInt *Size,
2928 const llvm::Twine &Name = Twine(""));
2929
2930 /// Create a runtime call for __tgt_interop_init
2931 ///
2932 /// \param Loc The insert and source location description.
2933 /// \param InteropVar variable to be allocated
2934 /// \param InteropType type of interop operation
2935 /// \param Device device to which offloading will occur
2936 /// \param NumDependences number of dependence variables
2937 /// \param DependenceAddress pointer to dependence variables
2938 /// \param HaveNowaitClause does nowait clause exist
2939 ///
2940 /// \returns CallInst to the __tgt_interop_init call
2941 LLVM_ABI CallInst *createOMPInteropInit(const LocationDescription &Loc,
2942 Value *InteropVar,
2943 omp::OMPInteropType InteropType,
2944 Value *Device, Value *NumDependences,
2945 Value *DependenceAddress,
2946 bool HaveNowaitClause);
2947
2948 /// Create a runtime call for __tgt_interop_destroy
2949 ///
2950 /// \param Loc The insert and source location description.
2951 /// \param InteropVar variable to be allocated
2952 /// \param Device device to which offloading will occur
2953 /// \param NumDependences number of dependence variables
2954 /// \param DependenceAddress pointer to dependence variables
2955 /// \param HaveNowaitClause does nowait clause exist
2956 ///
2957 /// \returns CallInst to the __tgt_interop_destroy call
2958 LLVM_ABI CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
2959 Value *InteropVar, Value *Device,
2960 Value *NumDependences,
2961 Value *DependenceAddress,
2962 bool HaveNowaitClause);
2963
2964 /// Create a runtime call for __tgt_interop_use
2965 ///
2966 /// \param Loc The insert and source location description.
2967 /// \param InteropVar variable to be allocated
2968 /// \param Device device to which offloading will occur
2969 /// \param NumDependences number of dependence variables
2970 /// \param DependenceAddress pointer to dependence variables
2971 /// \param HaveNowaitClause does nowait clause exist
2972 ///
2973 /// \returns CallInst to the __tgt_interop_use call
2974 LLVM_ABI CallInst *createOMPInteropUse(const LocationDescription &Loc,
2975 Value *InteropVar, Value *Device,
2976 Value *NumDependences,
2977 Value *DependenceAddress,
2978 bool HaveNowaitClause);
2979
2980 /// The `omp target` interface
2981 ///
2982 /// For more information about the usage of this interface,
2983 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
2984 ///
2985 ///{
2986
2987 /// Create a runtime call for kmpc_target_init
2988 ///
2989 /// \param Loc The insert and source location description.
2990 /// \param Attrs Structure containing the default attributes, including
2991 /// numbers of threads and teams to launch the kernel with.
2992 LLVM_ABI InsertPointTy createTargetInit(
2993 const LocationDescription &Loc,
2994 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs);
2995
2996 /// Create a runtime call for kmpc_target_deinit
2997 ///
2998 /// \param Loc The insert and source location description.
2999 /// \param TeamsReductionDataSize The maximal size of all the reduction data
3000 /// for teams reduction.
3001 /// \param TeamsReductionBufferLength The number of elements (each of up to
3002 /// \p TeamsReductionDataSize size), in the teams reduction buffer.
3003 LLVM_ABI void createTargetDeinit(const LocationDescription &Loc,
3004 int32_t TeamsReductionDataSize = 0,
3005 int32_t TeamsReductionBufferLength = 1024);
3006
3007 ///}
3008
3009 /// Helpers to read/write kernel annotations from the IR.
3010 ///
3011 ///{
3012
3013 /// Read/write the bounds on threads for \p Kernel. Read will return 0 if none
3014 /// is set.
3015 LLVM_ABI static std::pair<int32_t, int32_t>
3016 readThreadBoundsForKernel(const Triple &T, Function &Kernel);
3017 LLVM_ABI static void writeThreadBoundsForKernel(const Triple &T,
3018 Function &Kernel, int32_t LB,
3019 int32_t UB);
3020
3021 /// Read/write the bounds on teams for \p Kernel. Read will return 0 if none
3022 /// is set.
3023 LLVM_ABI static std::pair<int32_t, int32_t>
3024 readTeamBoundsForKernel(const Triple &T, Function &Kernel);
3025 LLVM_ABI static void writeTeamsForKernel(const Triple &T, Function &Kernel,
3026 int32_t LB, int32_t UB);
3027 ///}
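/// A minimal sketch of reading and updating the recorded thread bounds for a
/// kernel (the `Kernel` function, the `Triple T`, and the chosen bounds are
/// illustrative assumptions):
/// \code{.cpp}
///   auto [LB, UB] = OpenMPIRBuilder::readThreadBoundsForKernel(T, *Kernel);
///   if (UB == 0) // no upper bound recorded yet
///     OpenMPIRBuilder::writeThreadBoundsForKernel(T, *Kernel, /*LB=*/1,
///                                                 /*UB=*/256);
/// \endcode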
3028
3029private:
3030 // Sets the function attributes expected for the outlined function
3031 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
3032
3033 // Creates the function ID/Address for the given outlined function.
3034 // In the case of an embedded device function the address of the function is
3035 // used, in the case of a non-offload function a constant is created.
3036 Constant *createOutlinedFunctionID(Function *OutlinedFn,
3037 StringRef EntryFnIDName);
3038
3039 // Creates the region entry address for the outlined function
3040 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
3041 StringRef EntryFnName);
3042
3043public:
3044 /// Functions used to generate a function with the given name.
3045 using FunctionGenCallback =
3046 std::function<Expected<Function *>(StringRef FunctionName)>;
3047
3048 /// Create a unique name for the entry function using the source location
3049 /// information of the current target region. The name will be something like:
3050 ///
3051 /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
3052 ///
3053 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
3054 /// mangled name of the function that encloses the target region and BB is the
3055 /// line number of the target region. CC is a count added when more than one
3056 /// region is located at the same location.
3057 ///
3058 /// If this target outline function is not an offload entry, we don't need to
3059 /// register it. This may happen if it is guarded by an if clause that is
3060 /// false at compile time, or no target archs have been specified.
3061 ///
3062 /// The created target region ID is used by the runtime library to identify
3063 /// the current target region, so it only has to be unique and not
3064 /// necessarily point to anything. It could be the pointer to the outlined
3065 /// function that implements the target region, but we aren't using that so
3066 /// that the compiler doesn't need to keep that, and could therefore inline
3067 /// the host function if proven worthwhile during optimization. On the other
3068 /// hand, if emitting code for the device, the ID has to be the function
3069 /// address so that it can be retrieved from the offloading entry and launched
3070 /// by the runtime library. We also mark the outlined function to have
3071 /// external linkage in case we are emitting code for the device, because
3072 /// these functions will be entry points to the device.
3073 ///
3074 /// \param InfoManager The info manager keeping track of the offload entries
3075 /// \param EntryInfo The entry information about the function
3076 /// \param GenerateFunctionCallback The callback function to generate the code
3077 /// \param OutlinedFunction Pointer to the outlined function
3078 /// \param EntryFnIDName Name of the ID to be created
3079 LLVM_ABI Error emitTargetRegionFunction(
3080 TargetRegionEntryInfo &EntryInfo,
3081 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
3082 Function *&OutlinedFn, Constant *&OutlinedFnID);
3083
3084 /// Registers the given function and sets up the attributes of the function.
3085 /// Returns the FunctionID.
3086 ///
3087 /// \param InfoManager The info manager keeping track of the offload entries
3088 /// \param EntryInfo The entry information about the function
3089 /// \param OutlinedFunction Pointer to the outlined function
3090 /// \param EntryFnName Name of the outlined function
3091 /// \param EntryFnIDName Name of the ID to be created
3092 LLVM_ABI Constant *
3093 registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
3094 Function *OutlinedFunction,
3095 StringRef EntryFnName, StringRef EntryFnIDName);
3096
3097 /// Type of BodyGen to use for region codegen
3098 ///
3099 /// Priv: If device pointer privatization is required, emit the body of the
3100 /// region here. It will have to be duplicated: with and without
3101 /// privatization.
3102 /// DupNoPriv: If we need device pointer privatization, we need
3103 /// to emit the body of the region with no privatization in the 'else' branch
3104 /// of the conditional.
3105 /// NoPriv: If we don't require privatization of device
3106 /// pointers, we emit the body in between the runtime calls. This avoids
3107 /// duplicating the body code.
3108 enum BodyGenTy { Priv, DupNoPriv, NoPriv };
3109
3110 /// Callback type for creating the map infos for the kernel parameters.
3111 /// \param CodeGenIP is the insertion point where code should be generated,
3112 /// if any.
3113 using GenMapInfoCallbackTy =
3114 function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;
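/// A minimal sketch of such a callback. The captured `CombinedInfo`, the
/// mapped value `MappedVar`, and its size are illustrative assumptions.
/// \code{.cpp}
///   OpenMPIRBuilder::MapInfosTy CombinedInfo;
///   auto GenMapInfoCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP)
///       -> OpenMPIRBuilder::MapInfosTy & {
///     Builder.restoreIP(CodeGenIP);
///     CombinedInfo.BasePointers.push_back(MappedVar);
///     CombinedInfo.Pointers.push_back(MappedVar);
///     CombinedInfo.Sizes.push_back(Builder.getInt64(8)); // mapped bytes
///     CombinedInfo.Types.push_back(omp::OpenMPOffloadMappingFlags::OMP_MAP_TO);
///     CombinedInfo.DevicePointers.push_back(OpenMPIRBuilder::DeviceInfoTy::None);
///     return CombinedInfo;
///   };
/// \endcode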
3115
3116private:
3117 /// Emit the array initialization or deletion portion for user-defined mapper
3118 /// code generation. First, it evaluates whether an array section is mapped
3119 /// and whether the \a MapType instructs to delete this section. If \a IsInit
3120 /// is true, and \a MapType indicates to not delete this array, array
3121 /// initialization code is generated. If \a IsInit is false, and \a MapType
3122 /// indicates to delete this array, array deletion code is generated.
3123 void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
3124 llvm::Value *Base, llvm::Value *Begin,
3125 llvm::Value *Size, llvm::Value *MapType,
3126 llvm::Value *MapName, TypeSize ElementSize,
3127 llvm::BasicBlock *ExitBB, bool IsInit);
3128
3129public:
3130 /// Emit the user-defined mapper function. The code generation follows the
3131 /// pattern in the example below.
3132 /// \code
3133 /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
3134 /// void *base, void *begin,
3135 /// int64_t size, int64_t type,
3136 /// void *name = nullptr) {
3137 /// // Allocate space for an array section first or add a base/begin for
3138 /// // pointer dereference.
3139 /// if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
3140 /// !maptype.IsDelete)
3141 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3142 /// size*sizeof(Ty), clearToFromMember(type));
3143 /// // Map members.
3144 /// for (unsigned i = 0; i < size; i++) {
3145 /// // For each component specified by this mapper:
3146 /// for (auto c : begin[i]->all_components) {
3147 /// if (c.hasMapper())
3148 /// (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
3149 /// c.arg_size,
3150 /// c.arg_type, c.arg_name);
3151 /// else
3152 /// __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
3153 /// c.arg_begin, c.arg_size, c.arg_type,
3154 /// c.arg_name);
3155 /// }
3156 /// }
3157 /// // Delete the array section.
3158 /// if (size > 1 && maptype.IsDelete)
3159 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3160 /// size*sizeof(Ty), clearToFromMember(type));
3161 /// }
3162 /// \endcode
3163 ///
3164 /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the
3165 /// MapInfos and returns them.
3166 /// \param ElemTy DeclareMapper element type.
3167 /// \param FuncName Optional param to specify mapper function name.
3168 /// \param CustomMapperCB Optional callback to generate code related to
3169 /// custom mappers.
3170 LLVM_ABI Expected<Function *> emitUserDefinedMapper(
3171 function_ref<MapInfosOrErrorTy(
3172 InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)>
3173 PrivAndGenMapInfoCB,
3174 llvm::Type *ElemTy, StringRef FuncName,
3175 CustomMapperCallbackTy CustomMapperCB);
3176
3177 /// Generator for '#omp target data'
3178 ///
3179 /// \param Loc The location where the target data construct was encountered.
3180 /// \param AllocaIP The insertion points to be used for alloca instructions.
3181 /// \param CodeGenIP The insertion point at which the target directive code
3182 /// should be placed.
3183 /// \param IsBegin If true, emits the begin mapper call; otherwise emits the
3184 /// end mapper call.
3185 /// \param DeviceID Stores the DeviceID from the device clause.
3186 /// \param IfCond Value which corresponds to the if clause condition.
3187 /// \param Info Stores all information related to the Target Data directive.
3188 /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
3189 /// \param CustomMapperCB Callback to generate code related to
3190 /// custom mappers.
3191 /// \param BodyGenCB Optional Callback to generate the region code.
3192 /// \param DeviceAddrCB Optional callback to generate code related to
3193 /// use_device_ptr and use_device_addr.
3194 LLVM_ABI InsertPointOrErrorTy createTargetData(
3195 const LocationDescription &Loc, InsertPointTy AllocaIP,
3196 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
3197 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
3198 CustomMapperCallbackTy CustomMapperCB,
3199 omp::RuntimeFunction *MapperFunc = nullptr,
3200 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
3201 BodyGenTy BodyGenType)>
3202 BodyGenCB = nullptr,
3203 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
3204 Value *SrcLocInfo = nullptr);
3205
3206 using TargetBodyGenCallbackTy = function_ref<InsertPointOrErrorTy(
3207 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
3208
3209 using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
3210 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
3211 InsertPointTy CodeGenIP)>;
3212
3213 /// Generator for '#omp target'
3214 ///
3215 /// \param Loc The location where the target construct was encountered.
3216 /// \param IsOffloadEntry whether it is an offload entry.
3217 /// \param CodeGenIP The insertion point where the call to the outlined
3218 /// function should be emitted.
3219 /// \param Info Stores all information related to the Target directive.
3220 /// \param EntryInfo The entry information about the function.
3221 /// \param DefaultAttrs Structure containing the default attributes, including
3222 /// numbers of threads and teams to launch the kernel with.
3223 /// \param RuntimeAttrs Structure containing the runtime numbers of threads
3224 /// and teams to launch the kernel with.
3225 /// \param IfCond value of the `if` clause.
3226 /// \param Inputs The input values to the region that will be passed
3227 /// as arguments to the outlined function.
3228 /// \param BodyGenCB Callback that will generate the region code.
3229 /// \param ArgAccessorFuncCB Callback that will generate accessors
3230 /// instructions for passed-in target arguments where necessary.
3231 /// \param CustomMapperCB Callback to generate code related to
3232 /// custom mappers.
3233 /// \param Dependencies A vector of DependData objects that carry
3234 /// dependency information as passed in the depend clause
3235 /// \param HasNowait Whether the target construct has a `nowait` clause or
3236 /// not.
3237 LLVM_ABI InsertPointOrErrorTy createTarget(
3238 const LocationDescription &Loc, bool IsOffloadEntry,
3239 OpenMPIRBuilder::InsertPointTy AllocaIP,
3240 OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info,
3241 TargetRegionEntryInfo &EntryInfo,
3242 const TargetKernelDefaultAttrs &DefaultAttrs,
3243 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
3244 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
3245 TargetBodyGenCallbackTy BodyGenCB,
3246 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
3247 CustomMapperCallbackTy CustomMapperCB,
3248 const SmallVector<DependData> &Dependencies, bool HasNowait = false);
3249
3250 /// Returns __kmpc_for_static_init_* runtime function for the specified
3251 /// size \a IVSize and sign \a IVSigned. Will create a distribute call
3252 /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
3253 LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize,
3254 bool IVSigned,
3255 bool IsGPUDistribute);
3256
3257 /// Returns __kmpc_dispatch_init_* runtime function for the specified
3258 /// size \a IVSize and sign \a IVSigned.
3259 LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize,
3260 bool IVSigned);
3261
3262 /// Returns __kmpc_dispatch_next_* runtime function for the specified
3263 /// size \a IVSize and sign \a IVSigned.
3264 LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize,
3265 bool IVSigned);
3266
3267 /// Returns __kmpc_dispatch_fini_* runtime function for the specified
3268 /// size \a IVSize and sign \a IVSigned.
3269 LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize,
3270 bool IVSigned);
3271
3272 /// Returns __kmpc_dispatch_deinit runtime function.
3273 LLVM_ABI FunctionCallee createDispatchDeinitFunction();
3274
3275 /// Declarations for LLVM-IR types (simple, array, function and structure) are
3276 /// generated below. Their names are defined and used in OpenMPKinds.def. Here
3277 /// we provide the declarations, the initializeTypes function will provide the
3278 /// values.
3279 ///
3280 ///{
3281#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
3282#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
3283 ArrayType *VarName##Ty = nullptr; \
3284 PointerType *VarName##PtrTy = nullptr;
3285#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
3286 FunctionType *VarName = nullptr; \
3287 PointerType *VarName##Ptr = nullptr;
3288#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
3289 StructType *VarName = nullptr; \
3290 PointerType *VarName##Ptr = nullptr;
3291#include "llvm/Frontend/OpenMP/OMPKinds.def"
3292
3293 ///}
3294
3295private:
3296 /// Create all simple and struct types exposed by the runtime and remember
3297 /// the llvm::PointerTypes of them for easy access later.
3298 void initializeTypes(Module &M);
3299
3300 /// Common interface for generating entry calls for OMP Directives.
3301 /// If the directive has a region/body, it will set the insertion
3302 /// point to the body.
3303 ///
3304 /// \param OMPD Directive to generate entry blocks for
3305 /// \param EntryCall Call to the entry OMP Runtime Function
3306 /// \param ExitBB block where the region ends.
3307 /// \param Conditional indicates whether the entry call result will be used
3308 /// to evaluate a conditional of whether a thread will execute
3309 /// body code or not.
3310 ///
3311 /// \return The insertion position in exit block
3312 InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
3313 BasicBlock *ExitBB,
3314 bool Conditional = false);
3315
3316 /// Common interface to finalize the region
3317 ///
3318 /// \param OMPD Directive to generate exiting code for
3319 /// \param FinIP Insertion point for emitting Finalization code and exit call
3320 /// \param ExitCall Call to the ending OMP Runtime Function
3321 /// \param HasFinalize indicates whether the directive will require finalization
3322 /// and has a finalization callback in the stack that
3323 /// should be called.
3324 ///
3325 /// \return The insertion position in exit block
3326 InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
3327 InsertPointTy FinIP,
3328 Instruction *ExitCall,
3329 bool HasFinalize = true);
3330
3331 /// Common Interface to generate OMP inlined regions
3332 ///
3333 /// \param OMPD Directive to generate inlined region for
3334 /// \param EntryCall Call to the entry OMP Runtime Function
3335 /// \param ExitCall Call to the ending OMP Runtime Function
3336 /// \param BodyGenCB Body code generation callback.
3337 /// \param FiniCB Finalization Callback. Will be called when finalizing region
3338 /// \param Conditional indicates whether the entry call result will be used
3339 /// to evaluate a conditional of whether a thread will execute
3340 /// body code or not.
3341 /// \param HasFinalize indicates whether the directive will require finalization
3342 /// and has a finalization callback in the stack that
3343 /// should be called.
3344 /// \param IsCancellable if HasFinalize is set to true, indicates whether the
3345 /// directive should be cancellable.
3346 /// \return The insertion point after the region
3347 InsertPointOrErrorTy
3348 EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
3349 Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
3350 FinalizeCallbackTy FiniCB, bool Conditional = false,
3351 bool HasFinalize = true, bool IsCancellable = false);
3352
3353 /// Get the platform-specific name separator.
3354 /// \param Parts different parts of the final name that need separation
3355 /// \param FirstSeparator First separator used between the initial two
3356 /// parts of the name.
3357 /// \param Separator separator used between all of the remaining consecutive
3358 /// parts of the name
3359 static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
3360 StringRef FirstSeparator,
3361 StringRef Separator);
3362
3363 /// Returns corresponding lock object for the specified critical region
3364 /// name. If the lock object does not exist it is created, otherwise the
3365 /// reference to the existing copy is returned.
3366 /// \param CriticalName Name of the critical region.
3367 ///
3368 Value *getOMPCriticalRegionLock(StringRef CriticalName);
3369
3370 /// Callback type for Atomic Expression update
3371 /// ex:
3372 /// \code{.cpp}
3373 /// unsigned x = 0;
3374 /// #pragma omp atomic update
3375 /// x = Expr(x_old); //Expr() is any legal operation
3376 /// \endcode
3377 ///
3378 /// \param XOld the value of the atomic memory address to use for update
3379 /// \param IRB reference to the IRBuilder to use
3380 ///
3381 /// \returns Value to update X to.
3382 using AtomicUpdateCallbackTy =
3383 const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>;
3384
3385private:
3386 enum AtomicKind { Read, Write, Update, Capture, Compare };
3387
3388 /// Determine whether to emit flush or not
3389 ///
3390 /// \param Loc The insert and source location description.
3391 /// \param AO The required atomic ordering
3392 /// \param AK The OpenMP atomic operation kind used.
3393 ///
3394 /// \returns whether a flush was emitted or not
3395 bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
3396 AtomicOrdering AO, AtomicKind AK);
3397
3398 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3399 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3400 /// Only Scalar data types.
3401 ///
3402 /// \param AllocaIP The insertion point to be used for alloca
3403 /// instructions.
3404 /// \param X The target atomic pointer to be updated
3405 /// \param XElemTy The element type of the atomic pointer.
3406 /// \param Expr The value to update X with.
3407 /// \param AO Atomic ordering of the generated atomic
3408 /// instructions.
3409 /// \param RMWOp The binary operation used for the update. If the
3410 /// operation is not supported by atomicRMW,
3411 /// or belongs to {FADD, FSUB, BAD_BINOP},
3412 /// then a `cmpExch`-based atomic will be generated.
3413 /// \param UpdateOp Code generator for complex expressions that cannot be
3414 /// expressed through atomicrmw instruction.
3415 /// \param VolatileX true if \a X is volatile.
3416 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3417 /// update expression, false otherwise.
3418 /// (e.g. true for X = X BinOp Expr)
3419 ///
3420 /// \returns A pair of the old value of X before the update, and the value
3421 /// used for the update.
3422 Expected<std::pair<Value *, Value *>>
3423 emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3424 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3425 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
3426 bool IsXBinopExpr, bool IsIgnoreDenormalMode,
3427 bool IsFineGrainedMemory, bool IsRemoteMemory);
3428
3429 /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
3430 ///
3431 /// \return The instruction.
3432 Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3433 AtomicRMWInst::BinOp RMWOp);
3434
3435 bool IsFinalized;
3436
3437public:
3438 /// a struct to pack relevant information while generating atomic Ops
3439 struct AtomicOpValue {
3440 Value *Var = nullptr;
3441 Type *ElemTy = nullptr;
3442 bool IsSigned = false;
3443 bool IsVolatile = false;
3444 };
3445
3446 /// Emit atomic Read for : V = X --- Only Scalar data types.
3447 ///
3448 /// \param Loc The insert and source location description.
3449 /// \param X The target pointer to be atomically read
3450 /// \param V Memory address where to store atomically read
3451 /// value
3452 /// \param AO Atomic ordering of the generated atomic
3453 /// instructions.
3454 /// \param AllocaIP Insert point for allocas
3455 ///
3456 /// \return Insertion point after generated atomic read IR.
3457 LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc,
3458 AtomicOpValue &X, AtomicOpValue &V,
3459 AtomicOrdering AO,
3460 InsertPointTy AllocaIP);
3461
3462 /// Emit atomic write for : X = Expr --- Only Scalar data types.
3463 ///
3464 /// \param Loc The insert and source location description.
3465 /// \param X The target pointer to be atomically written to
3466 /// \param Expr The value to store.
3467 /// \param AO Atomic ordering of the generated atomic
3468 /// instructions.
3469 /// \param AllocaIP Insert point for allocas
3470 ///
3471 /// \return Insertion point after generated atomic Write IR.
3472 LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc,
3473 AtomicOpValue &X, Value *Expr,
3474 AtomicOrdering AO,
3475 InsertPointTy AllocaIP);
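/// A minimal sketch of an atomic read followed by an atomic write of the same
/// location (the pointers `XPtr`/`VPtr`, the `Loc`, the ordering, and
/// `AllocaIP` are illustrative assumptions):
/// \code{.cpp}
///   OpenMPIRBuilder::AtomicOpValue X{XPtr, Builder.getInt32Ty(),
///                                    /*IsSigned=*/true, /*IsVolatile=*/false};
///   OpenMPIRBuilder::AtomicOpValue V{VPtr, Builder.getInt32Ty(),
///                                    /*IsSigned=*/true, /*IsVolatile=*/false};
///   Builder.restoreIP(OMPBuilder.createAtomicRead(
///       Loc, X, V, AtomicOrdering::Monotonic, AllocaIP));   // v = x;
///   Builder.restoreIP(OMPBuilder.createAtomicWrite(
///       Loc, X, Builder.getInt32(42), AtomicOrdering::Monotonic, AllocaIP));
/// \endcode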
3476
3477 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3478 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3479 /// Only Scalar data types.
3480 ///
3481 /// \param Loc The insert and source location description.
3482 /// \param AllocaIP The insertion point to be used for alloca instructions.
3483 /// \param X The target atomic pointer to be updated
3484 /// \param Expr The value to update X with.
3485 /// \param AO Atomic ordering of the generated atomic instructions.
3486 /// \param RMWOp The binary operation used for the update. If the operation
3487 /// is not supported by atomicRMW, or belongs to
3488 /// {FADD, FSUB, BAD_BINOP}, then a `cmpExch`-based
3489 /// atomic will be generated.
3490 /// \param UpdateOp Code generator for complex expressions that cannot be
3491 /// expressed through atomicrmw instruction.
3492 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3493 /// update expression, false otherwise.
3494 /// (e.g. true for X = X BinOp Expr)
3495 ///
3496 /// \return Insertion point after generated atomic update IR.
3497 LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(
3498 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
3499 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3500 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
3501 bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
3502 bool IsRemoteMemory = false);
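/// A minimal sketch of an update callback and the corresponding call, emitting
/// 'x = x + 1' via atomicrmw when possible (the `X` operand, `Loc`, and
/// `AllocaIP` are illustrative assumptions):
/// \code{.cpp}
///   auto UpdateOp = [](Value *XOld, IRBuilder<> &IRB) -> Expected<Value *> {
///     return IRB.CreateAdd(XOld, IRB.getInt32(1)); // the new value of x
///   };
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createAtomicUpdate(
///       Loc, AllocaIP, X, /*Expr=*/Builder.getInt32(1),
///       AtomicOrdering::Monotonic, AtomicRMWInst::Add, UpdateOp,
///       /*IsXBinopExpr=*/true);
///   if (!AfterIP)
///     return AfterIP.takeError();
/// \endcode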
3503
3504 /// Emit atomic update for constructs: --- Only Scalar data types
3505 /// V = X; X = X BinOp Expr ,
3506 /// X = X BinOp Expr; V = X,
3507 /// V = X; X = Expr BinOp X,
3508 /// X = Expr BinOp X; V = X,
3509 /// V = X; X = UpdateOp(X),
3510 /// X = UpdateOp(X); V = X,
3511 ///
3512 /// \param Loc The insert and source location description.
3513 /// \param AllocaIP The insertion point to be used for alloca instructions.
3514 /// \param X The target atomic pointer to be updated
3515 /// \param V Memory address where to store captured value
3516 /// \param Expr The value to update X with.
3517 /// \param AO Atomic ordering of the generated atomic instructions
3518 /// \param RMWOp The binary operation used for the update. If the
3519 /// operation is not supported by atomicRMW, or belongs to
3520 /// {FADD, FSUB, BAD_BINOP}, then a cmpExch-based
3521 /// atomic will be generated.
3522 /// \param UpdateOp Code generator for complex expressions that cannot be
3523 /// expressed through atomicrmw instruction.
3524 /// \param UpdateExpr true if X is an in place update of the form
3525 /// X = X BinOp Expr or X = Expr BinOp X
3526 /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
3527 /// update expression, false otherwise.
3528 /// (e.g. true for X = X BinOp Expr)
3529 /// \param IsPostfixUpdate true if original value of 'x' must be stored in
3530 /// 'v', not an updated one.
3531 ///
3532 /// \return Insertion point after generated atomic capture IR.
3533 LLVM_ABI InsertPointOrErrorTy createAtomicCapture(
3534 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
3535 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
3536 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
3537 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
3538 bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
3539 bool IsRemoteMemory = false);
3540
3541 /// Emit atomic compare for constructs: --- Only scalar data types
3542 /// cond-expr-stmt:
3543 /// x = x ordop expr ? expr : x;
3544 /// x = expr ordop x ? expr : x;
3545 /// x = x == e ? d : x;
3546 /// x = e == x ? d : x; (this one is not in the spec)
3547 /// cond-update-stmt:
3548 /// if (x ordop expr) { x = expr; }
3549 /// if (expr ordop x) { x = expr; }
3550 /// if (x == e) { x = d; }
3551 /// if (e == x) { x = d; } (this one is not in the spec)
3552 /// conditional-update-capture-atomic:
3553 /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
3554 /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
3555 /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3556 /// IsFailOnly=true)
3557 /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
3558 /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3559 /// IsFailOnly=true)
3560 ///
3561 /// \param Loc The insert and source location description.
3562 /// \param X The target atomic pointer to be updated.
3563 /// \param V Memory address where to store captured value (for
3564 /// compare capture only).
3565 /// \param R Memory address where to store comparison result
3566 /// (for compare capture with '==' only).
3567 /// \param E The expected value ('e') for forms that use an
3568 /// equality comparison or an expression ('expr') for
3569 /// forms that use 'ordop' (logically an atomic maximum or
3570 /// minimum).
3571 /// \param D The desired value for forms that use an equality
3572 /// comparison. For forms that use 'ordop', it should be
3573 /// \p nullptr.
3574 /// \param AO Atomic ordering of the generated atomic instructions.
3575 /// \param Op Atomic compare operation. It can only be ==, <, or >.
3576 /// \param IsXBinopExpr True if the conditional statement is in the form where
3577 /// x is on LHS. It only matters for < or >.
3578 /// \param IsPostfixUpdate True if original value of 'x' must be stored in
3579 /// 'v', not an updated one (for compare capture
3580 /// only).
3581 /// \param IsFailOnly True if the original value of 'x' is stored to 'v'
3582 /// only when the comparison fails. This is only valid for
3583 /// the case the comparison is '=='.
3584 ///
3585 /// \return Insertion point after generated atomic capture IR.
3586 LLVM_ABI InsertPointTy
3587 createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
3588 AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
3589 AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
3590 bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
3591 LLVM_ABI InsertPointTy createAtomicCompare(
3592 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
3593 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
3594 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
3595 bool IsFailOnly, AtomicOrdering Failure);
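/// For reference, a typical source form that lowers to the 'cond-update-stmt'
/// case above (a hedged illustration of the OpenMP 5.1 'atomic compare'
/// construct; `x` and `e` are placeholders):
/// \code{.cpp}
///   // x = min(x, e), expressed as a conditional update:
///   #pragma omp atomic compare
///   if (x > e) { x = e; }
/// \endcode
/// Such a statement would map to createAtomicCompare with the '>' comparison,
/// E set to `e`, D set to nullptr, and IsXBinopExpr = true (x on the left-hand
/// side of the comparison).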
3596
3597 /// Create the control flow structure of a canonical OpenMP loop.
3598 ///
3599 /// The emitted loop will be disconnected, i.e. no edge to the loop's
3600 /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
3601 /// IRBuilder location is not preserved.
3602 ///
3603 /// \param DL DebugLoc used for the instructions in the skeleton.
3604 /// \param TripCount Value to be used for the trip count.
3605 /// \param F Function in which to insert the BasicBlocks.
3606 /// \param PreInsertBefore Where to insert BBs that execute before the body,
3607 /// typically the body itself.
3608 /// \param PostInsertBefore Where to insert BBs that execute after the body.
3609 /// \param Name Base name used to derive BB
3610 /// and instruction names.
3611 ///
3612 /// \returns The CanonicalLoopInfo that represents the emitted loop.
3613 LLVM_ABI CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
3614 Function *F,
3615 BasicBlock *PreInsertBefore,
3616 BasicBlock *PostInsertBefore,
3617 const Twine &Name = {});
3618 /// OMP Offload Info Metadata name string
3619 const std::string ompOffloadInfoName = "omp_offload.info";
3620
3621 /// Loads all the offload entries information from the host IR
3622 /// metadata. This function is only meant to be used with device code
3623 /// generation.
3624 ///
3625 /// \param M Module to load Metadata info from. The module passed may be
3626 /// loaded from a bitcode file, i.e., different from the OpenMPIRBuilder::M module.
3627 LLVM_ABI void loadOffloadInfoMetadata(Module &M);
3628
3629 /// Loads all the offload entries information from the host IR
3630 /// metadata read from the file passed in as the HostFilePath argument. This
3631 /// function is only meant to be used with device code generation.
3632 ///
3633 /// \param HostFilePath The path to the host IR file,
3634 /// used to load in offload metadata for the device, allowing host and device
3635 /// to maintain the same metadata mapping.
3636 LLVM_ABI void loadOffloadInfoMetadata(vfs::FileSystem &VFS,
3637 StringRef HostFilePath);
3638
3639 /// Gets (if a variable with the given name already exists) or creates an
3640 /// internal global variable with the specified Name. The created variable has
3641 /// CommonLinkage by default and is initialized by a null value.
3642 /// \param Ty Type of the global variable. If it already exists, the type
3643 /// must be the same.
3644 /// \param Name Name of the variable.
3645 LLVM_ABI GlobalVariable *
3646 getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
3647 unsigned AddressSpace = 0);
3648};
3649
3650/// Class to represent the control flow structure of an OpenMP canonical loop.
3651///
3652/// The control-flow structure is standardized for easy consumption by
3653/// directives associated with loops. For instance, the worksharing-loop
3654/// construct may change this control flow such that each loop iteration is
3655/// executed on only one thread. The constraints of a canonical loop in brief
3656/// are:
3657///
3658/// * The number of loop iterations must have been computed before entering the
3659/// loop.
3660///
3661/// * Has an (unsigned) logical induction variable that starts at zero and
3662/// increments by one.
3663///
3664/// * The loop's CFG itself has no side-effects. The OpenMP specification
3665/// itself allows side-effects, but the order in which they happen, including
3666/// how often or whether at all, is unspecified. We expect that the frontend
3667/// will emit those side-effect instructions somewhere (e.g. before the loop)
3668/// such that the CanonicalLoopInfo itself can be side-effect free.
3669///
3670/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
3671/// execution of a loop body that satisfies these constraints. It does NOT
3672/// represent arbitrary SESE regions that happen to contain a loop. Do not use
3673/// CanonicalLoopInfo for such purposes.
3674///
3675/// The control flow can be described as follows:
3676///
3677/// Preheader
3678/// |
3679/// /-> Header
3680/// | |
3681/// | Cond---\
3682/// | | |
3683/// | Body |
3684/// | | | |
3685/// | <...> |
3686/// | | | |
3687/// \--Latch |
3688/// |
3689/// Exit
3690/// |
3691/// After
3692///
3693/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3694/// including) and end at AfterIP (at the After's first instruction, excluding).
3695/// That is, instructions in the Preheader and After blocks (except the
3696/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3697/// side-effects. Typically, the Preheader is used to compute the loop's trip
3698/// count. The instructions from BodyIP (at the Body block's first instruction,
3699/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
3700/// control and thus can have side-effects. The body block is the single entry
3701/// point into the loop body, which may contain arbitrary control flow as long
3702/// as all control paths eventually branch to the Latch block.
3703///
3704/// TODO: Consider adding another standardized BasicBlock between the Body CFG
3705/// and the Latch to guarantee that there is only a single edge to the latch.
3706/// It would make loop transformations easier by not needing to consider
3707/// multiple predecessors of the latch (see redirectAllPredecessorsTo) and would
3708/// give us an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code
3709/// that executes after each body iteration.
3710///
3711/// There must be no loop-carried dependencies through llvm::Values. This is
3712/// equivalent to saying that the Latch has no PHINode and that the Header's
3713/// only PHINode is for the induction variable.
3714///
3715/// All code in the Header, Cond, Latch and Exit blocks (plus the terminator of
3716/// the Preheader) is CanonicalLoopInfo's responsibility and its build-up is
3717/// checked by assertOK(). These blocks are expected not to be modified unless
3718/// explicitly modifying the CanonicalLoopInfo through a method that applies an
3719/// OpenMP loop-associated construct, such as applyWorkshareLoop, tileLoops,
3720/// unrollLoop, etc. These methods usually invalidate the CanonicalLoopInfo and
3721/// re-use its basic blocks. After invalidation, the CanonicalLoopInfo must not
3722/// be used anymore as its underlying control flow may no longer exist.
3723/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3724/// may also return a new CanonicalLoopInfo that can be passed to other methods
3725/// implementing loop-associated constructs. These loop-transforming methods may
3726/// either create a new CanonicalLoopInfo, usually using createLoopSkeleton, and
3727/// invalidate the input CanonicalLoopInfo, or reuse and modify one of the input
3728/// CanonicalLoopInfos and return it as representing the modified loop. Which of
3729/// these is done is an implementation detail of the transformation-implementing
3730/// method, and callers should always assume that the CanonicalLoopInfo passed
3731/// to the method is invalidated and a new object is returned. Returned
3732/// CanonicalLoopInfos have the same structure and guarantees as one created by
3733/// createCanonicalLoop, such that transforming methods do not have to
3734/// special-case where the CanonicalLoopInfo originated from.
3735///
3736/// Generally, methods consuming CanonicalLoopInfo do not need an
3737/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3738/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3739/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3740/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3741/// any InsertPoint in the Preheader, After or Body blocks can still be used
3742/// after calling such a method.
3743///
3744/// TODO: Provide mechanisms for exception handling and cancellation points.
3745///
3746/// Defined outside OpenMPIRBuilder because nested classes cannot be
3747/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
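///
/// A minimal usage sketch (assumes `CLI` is a valid CanonicalLoopInfo,
/// `Builder` is an IRBuilder, and `UseIVFn` is some callee taking the
/// induction variable; all three names are illustrative assumptions):
/// \code
///   // Code that must run before any iteration, e.g. trip-count set-up.
///   Builder.restoreIP(CLI->getPreheaderIP());
///
///   // Emit the per-iteration body using the normalized induction variable.
///   Builder.restoreIP(CLI->getBodyIP());
///   Value *IV = CLI->getIndVar();
///   Builder.CreateCall(UseIVFn, {IV});
///
///   // Clean-up code that runs once after the loop.
///   Builder.restoreIP(CLI->getAfterIP());
/// \endcode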
3748class CanonicalLoopInfo {
3749 friend class OpenMPIRBuilder;
3750
3751private:
3752 BasicBlock *Header = nullptr;
3753 BasicBlock *Cond = nullptr;
3754 BasicBlock *Latch = nullptr;
3755 BasicBlock *Exit = nullptr;
3756
3757 // Holds the MLIR value for the `lastiter` of the canonical loop.
3758 Value *LastIter = nullptr;
3759
3760 /// Add the control blocks of this loop to \p BBs.
3761 ///
3762 /// This does not include any block from the body, not even the one returned
3763 /// by getBody().
3764 ///
3765 /// FIXME: This currently includes the Preheader and After blocks even though
3766 /// their content is (mostly) not under CanonicalLoopInfo's control.
3767 /// Re-evaluate whether this makes sense.
3768 void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
3769
3770 /// Sets the number of loop iterations to the given value. This value must be
3771 /// valid in the condition block (i.e., defined in the preheader) and is
3772 /// interpreted as an unsigned integer.
3773 void setTripCount(Value *TripCount);
3774
3775 /// Replace all uses of the canonical induction variable in the loop body with
3776 /// a new one.
3777 ///
3778 /// The intended use case is to update the induction variable for an updated
3779 /// iteration space such that it can stay normalized in the 0...tripcount-1
3780 /// range.
3781 ///
3782 /// The \p Updater is called with the (presumably updated) current normalized
3783 /// induction variable and is expected to return the value that uses of the
3784 /// pre-updated induction variable should use instead, typically dependent on
3785 /// the new induction variable. This is a lambda (instead of e.g. just passing
3786 /// the new value) to be able to distinguish between uses of the pre-updated
3787 /// induction variable and uses of the induction variable that compute the
3788 /// updated induction variable value.
3789 void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
3790
3791public:
3792 /// Sets the last iteration variable for this loop.
3793 void setLastIter(Value *IterVar) { LastIter = std::move(IterVar); }
3794
3795 /// Returns the last iteration variable for this loop.
3796 /// Certain use-cases (like translation of the linear clause) may access
3797 /// this variable even after a loop transformation. Hence, do not guard
3798 /// this getter function with `isValid`. It is the responsibility of the
3799 /// caller to ensure this functionality is not invoked on a non-outlined
3800 /// CanonicalLoopInfo object (in which case, `setLastIter` will never be
3801 /// invoked and `LastIter` will be `nullptr` by default).
3802 Value *getLastIter() { return LastIter; }
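  // A minimal sketch (assumes `CLI` points to this loop and
  // `emitLinearFinalization` is a hypothetical frontend helper):
  //
  //   if (Value *LastIter = CLI->getLastIter())
  //     emitLinearFinalization(LastIter); // only set if setLastIter was called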
3803
3804 /// Returns whether this object currently represents the IR of a loop. If
3805 /// returning false, it may have been consumed by a loop transformation or not
3806 /// been initialized. Do not use it in this case.
3807 bool isValid() const { return Header; }
3808
3809 /// The preheader ensures that there is only a single edge entering the loop.
3810 /// Code that must be executed before any loop iteration can be emitted here,
3811 /// such as computing the loop trip count and begin lifetime markers. Code in
3812 /// the preheader is not considered part of the canonical loop.
3813 LLVM_ABI BasicBlock *getPreheader() const;
3814
3815 /// The header is the entry for each iteration. In the canonical control flow,
3816 /// it only contains the PHINode for the induction variable.
3817 BasicBlock *getHeader() const {
3818 assert(isValid() && "Requires a valid canonical loop");
3819 return Header;
3820 }
3821
3822 /// The condition block computes whether there is another loop iteration. If
3823 /// so, it branches to the body; otherwise to the exit block.
3824 BasicBlock *getCond() const {
3825 assert(isValid() && "Requires a valid canonical loop");
3826 return Cond;
3827 }
3828
3829 /// The body block is the single entry for a loop iteration and not controlled
3830 /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
3831 /// eventually branch to the \p Latch block.
3832 BasicBlock *getBody() const {
3833 assert(isValid() && "Requires a valid canonical loop");
3834 return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
3835 }
3836
3837 /// Reaching the latch indicates the end of the loop body code. In the
3838 /// canonical control flow, it only contains the increment of the induction
3839 /// variable.
3840 BasicBlock *getLatch() const {
3841 assert(isValid() && "Requires a valid canonical loop");
3842 return Latch;
3843 }
3844
3845 /// Reaching the exit indicates no more iterations are being executed.
3846 BasicBlock *getExit() const {
3847 assert(isValid() && "Requires a valid canonical loop");
3848 return Exit;
3849 }
3850
3851 /// The after block is intended for clean-up code such as lifetime end
3852 /// markers. It is separate from the exit block to ensure, analogous to the
3853 /// preheader, that it has just a single entry edge and is free from PHI
3854 /// nodes should there be multiple loop exits (such as from break
3855 /// statements/cancellations).
3856 BasicBlock *getAfter() const {
3857 assert(isValid() && "Requires a valid canonical loop");
3858 return Exit->getSingleSuccessor();
3859 }
3860
3861 /// Returns the llvm::Value containing the number of loop iterations. It must
3862 /// be valid in the preheader and always interpreted as an unsigned integer of
3863 /// any bit-width.
3864 Value *getTripCount() const {
3865 assert(isValid() && "Requires a valid canonical loop");
3866 Instruction *CmpI = &Cond->front();
3867 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
3868 return CmpI->getOperand(1);
3869 }
3870
3871 /// Returns the instruction representing the current logical induction
3872 /// variable. Always unsigned, always starting at 0 with an increment of one.
3873 Instruction *getIndVar() const {
3874 assert(isValid() && "Requires a valid canonical loop");
3875 Instruction *IndVarPHI = &Header->front();
3876 assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
3877 return IndVarPHI;
3878 }
3879
3880 /// Return the type of the induction variable (and the trip count).
3881 Type *getIndVarType() const {
3882 assert(isValid() && "Requires a valid canonical loop");
3883 return getIndVar()->getType();
3884 }
3885
3886 /// Return the insertion point for user code before the loop.
3887 OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
3888 assert(isValid() && "Requires a valid canonical loop");
3889 BasicBlock *Preheader = getPreheader();
3890 return {Preheader, std::prev(Preheader->end())};
3891 };
3892
3893 /// Return the insertion point for user code in the body.
3894 OpenMPIRBuilder::InsertPointTy getBodyIP() const {
3895 assert(isValid() && "Requires a valid canonical loop");
3896 BasicBlock *Body = getBody();
3897 return {Body, Body->begin()};
3898 };
3899
3900 /// Return the insertion point for user code after the loop.
3901 OpenMPIRBuilder::InsertPointTy getAfterIP() const {
3902 assert(isValid() && "Requires a valid canonical loop");
3903 BasicBlock *After = getAfter();
3904 return {After, After->begin()};
3905 };
3906
3907 Function *getFunction() const {
3908 assert(isValid() && "Requires a valid canonical loop");
3909 return Header->getParent();
3910 }
3911
3912 /// Consistency self-check.
3913 LLVM_ABI void assertOK() const;
3914
3915 /// Invalidate this loop. That is, the underlying IR does not fulfill the
3916 /// requirements of an OpenMP canonical loop anymore.
3917 LLVM_ABI void invalidate();
3918};
3919
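// A minimal sketch of the invalidation contract described above (assumes
// `OMPBuilder`, a DebugLoc `DL`, a valid `CanonicalLoopInfo *CLI` and a tile
// size `Value *TileSize`; the choice of tileLoops as the transformation is
// illustrative):
//
//   std::vector<CanonicalLoopInfo *> Tiled =
//       OMPBuilder.tileLoops(DL, {CLI}, {TileSize});
//   // Treat CLI as invalidated from here on; only use the returned loops,
//   // e.g. the outermost floor loop and the innermost tile loop.
//   CanonicalLoopInfo *Floor = Tiled.front();
//   CanonicalLoopInfo *Tile = Tiled.back();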
3920/// ScanInfo holds the information to assist in the lowering of a scan reduction.
3921/// Before lowering, the body of the for-loop specifying the scan reduction is
3922/// expected to have the following structure:
3923///
3924/// Loop Body Entry
3925/// |
3926/// Code before the scan directive
3927/// |
3928/// Scan Directive
3929/// |
3930/// Code after the scan directive
3931/// |
3932/// Loop Body Exit
3933/// When `createCanonicalScanLoops` is executed, its bodyGen callback
3934/// transforms the body to:
3935///
3936/// Loop Body Entry
3937/// |
3938/// OMPScanDispatch
3939///
3940/// OMPBeforeScanBlock
3941/// |
3942/// OMPScanLoopExit
3943/// |
3944/// Loop Body Exit
3945///
3946/// The insert point is updated to the first insert point of OMPBeforeScanBlock,
3947/// which dominates the control flow of code generated until the
3948/// scan directive is encountered, while OMPAfterScanBlock dominates the
3949/// control flow of code generated after the scan directive is encountered. The
3950/// successor of OMPScanDispatch can be OMPBeforeScanBlock or OMPAfterScanBlock,
3951/// based on (1) whether it is the input phase or the scan phase, and (2) whether
3952/// it is an exclusive or inclusive scan. This jump is added when `createScan` is
3953/// executed. If the input loop is being generated and the scan is inclusive,
3954/// `OMPAfterScanBlock` succeeds `OMPScanDispatch`; if exclusive,
3955/// `OMPBeforeScanBlock` succeeds `OMPScanDispatch`; and vice versa for the scan
3956/// loop. At the end of the input loop the temporary buffer is populated, and at
3957/// the beginning of the scan loop the temporary buffer is read. After the scan
3958/// directive is encountered, the insertion point is updated to
3959/// `OMPAfterScanBlock` as it is expected to dominate the code after the scan
3960/// directive. Both the before- and after-scan blocks are succeeded by
3961/// `OMPScanLoopExit`.
3962/// Temporary buffer allocations are done in the `ScanLoopInit` block before the
3963/// lowering of the for-loop; results are copied back to the reduction variable in the `ScanLoopFinish` block.
3964class ScanInfo {
3965public:
3966 /// Dominates the body of the loop before the scan directive
3967 llvm::BasicBlock *OMPBeforeScanBlock = nullptr;
3968
3969 /// Dominates the body of the loop after the scan directive
3970 llvm::BasicBlock *OMPAfterScanBlock = nullptr;
3971
3972 /// Controls the flow to before or after scan blocks
3973 llvm::BasicBlock *OMPScanDispatch = nullptr;
3974
3975 /// Exit block of loop body
3976 llvm::BasicBlock *OMPScanLoopExit = nullptr;
3977
3978 /// Block before loop body where scan initializations are done
3979 llvm::BasicBlock *OMPScanInit = nullptr;
3980
3981 /// Block after loop body where scan finalizations are done
3982 llvm::BasicBlock *OMPScanFinish = nullptr;
3983
3984 /// If true, the input phase is being lowered; otherwise, the
3985 /// scan phase is being lowered.
3986 bool OMPFirstScanLoop = false;
3987
3988 /// Maps the private reduction variable to the pointer of the temporary
3989 /// buffer
3990 llvm::SmallDenseMap<llvm::Value *, llvm::Value *> *ScanBuffPtrs;
3991
3992 /// Keeps track of the value of the iteration variable of the input/scan loop,
3993 /// used for scan directive lowering.
3994 llvm::Value *IV;
3995
3996 /// Stores the span of the canonical loop being lowered, used for temporary
3997 /// buffer allocation or finalization.
3998 llvm::Value *Span;
3999
4000 ScanInfo() {
4001 ScanBuffPtrs = new llvm::SmallDenseMap<llvm::Value *, llvm::Value *>();
4002 }
4003
4004 ~ScanInfo() { delete (ScanBuffPtrs); }
4005};
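// A minimal sketch of how a lowering step might use ScanBuffPtrs (`SI`,
// `PrivVar` and `BuffPtr` are illustrative assumptions; `PrivVar` is the
// private reduction variable and `BuffPtr` the temporary buffer pointer):
//
//   ScanInfo SI;
//   (*SI.ScanBuffPtrs)[PrivVar] = BuffPtr;          // input phase: record
//   Value *Buff = SI.ScanBuffPtrs->lookup(PrivVar); // scan phase: look up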
4006
4007} // end namespace llvm
4008
4009#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H