LLVM 21.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
29#include "llvm/IR/Attributes.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/CFG.h"
32#include "llvm/IR/CallingConv.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DIBuilder.h"
38#include "llvm/IR/Function.h"
40#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/LLVMContext.h"
44#include "llvm/IR/MDBuilder.h"
45#include "llvm/IR/Metadata.h"
47#include "llvm/IR/PassManager.h"
49#include "llvm/IR/Value.h"
61
62#include <cstdint>
63#include <optional>
64
65#define DEBUG_TYPE "openmp-ir-builder"
66
67using namespace llvm;
68using namespace omp;
69
// Command-line toggles controlling the OpenMP-IR-Builder.
70static cl::opt<bool>
71 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
72 cl::desc("Use optimistic attributes describing "
73 "'as-if' properties of runtime calls."),
74 cl::init(false));
75
// NOTE(review): the declaration line for this second option (original line 76,
// presumably `static cl::opt<double> UnrollThresholdFactor(`) is missing from
// this capture — confirm against the upstream source.
77 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
78 cl::desc("Factor for the unroll threshold to account for code "
79 "simplifications still taking place"),
80 cl::init(1.5));
81
82#ifndef NDEBUG
83/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
84/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
85/// an InsertPoint stores the instruction before something is inserted. For
86/// instance, if both point to the same instruction, two IRBuilders alternating
87/// creating instructions will cause the instructions to be interleaved.
// NOTE(review): the function signature (original lines 88-89, presumably
// `static bool isConflictIP(IRBuilder<>::InsertPoint IP1, ... IP2)`) is
// missing from this capture — confirm against upstream.
// An unset insert point cannot conflict with anything.
90 if (!IP1.isSet() || !IP2.isSet())
91 return false;
// Two set insert points conflict exactly when they name the same position.
92 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
93}
94
// NOTE(review): the function signature (original line 95, presumably
// `static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {`)
// is missing from this capture — confirm against upstream. Debug-only sanity
// check: accepts a schedule type iff (a) with monotonicity bits stripped it is
// one of the known ordered/unordered/nomerge base combinations, and (b) the
// two monotonicity modifier bits are not both set.
96 // Valid ordered/unordered and base algorithm combinations.
97 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
98 case OMPScheduleType::UnorderedStaticChunked:
99 case OMPScheduleType::UnorderedStatic:
100 case OMPScheduleType::UnorderedDynamicChunked:
101 case OMPScheduleType::UnorderedGuidedChunked:
102 case OMPScheduleType::UnorderedRuntime:
103 case OMPScheduleType::UnorderedAuto:
104 case OMPScheduleType::UnorderedTrapezoidal:
105 case OMPScheduleType::UnorderedGreedy:
106 case OMPScheduleType::UnorderedBalanced:
107 case OMPScheduleType::UnorderedGuidedIterativeChunked:
108 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
109 case OMPScheduleType::UnorderedSteal:
110 case OMPScheduleType::UnorderedStaticBalancedChunked:
111 case OMPScheduleType::UnorderedGuidedSimd:
112 case OMPScheduleType::UnorderedRuntimeSimd:
113 case OMPScheduleType::OrderedStaticChunked:
114 case OMPScheduleType::OrderedStatic:
115 case OMPScheduleType::OrderedDynamicChunked:
116 case OMPScheduleType::OrderedGuidedChunked:
117 case OMPScheduleType::OrderedRuntime:
118 case OMPScheduleType::OrderedAuto:
// NOTE(review): "OrderdTrapezoidal" looks like a typo for "OrderedTrapezoidal"
// — verify the enumerator name against llvm/Frontend/OpenMP/OMPConstants.h.
119 case OMPScheduleType::OrderdTrapezoidal:
120 case OMPScheduleType::NomergeUnorderedStaticChunked:
121 case OMPScheduleType::NomergeUnorderedStatic:
122 case OMPScheduleType::NomergeUnorderedDynamicChunked:
123 case OMPScheduleType::NomergeUnorderedGuidedChunked:
124 case OMPScheduleType::NomergeUnorderedRuntime:
125 case OMPScheduleType::NomergeUnorderedAuto:
126 case OMPScheduleType::NomergeUnorderedTrapezoidal:
127 case OMPScheduleType::NomergeUnorderedGreedy:
128 case OMPScheduleType::NomergeUnorderedBalanced:
129 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
130 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
131 case OMPScheduleType::NomergeUnorderedSteal:
132 case OMPScheduleType::NomergeOrderedStaticChunked:
133 case OMPScheduleType::NomergeOrderedStatic:
134 case OMPScheduleType::NomergeOrderedDynamicChunked:
135 case OMPScheduleType::NomergeOrderedGuidedChunked:
136 case OMPScheduleType::NomergeOrderedRuntime:
137 case OMPScheduleType::NomergeOrderedAuto:
138 case OMPScheduleType::NomergeOrderedTrapezoidal:
139 break;
140 default:
141 return false;
142 }
143
144 // Must not set both monotonicity modifiers at the same time.
145 OMPScheduleType MonotonicityFlags =
146 SchedType & OMPScheduleType::MonotonicityMask;
147 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
148 return false;
149
150 return true;
151}
152#endif
153
/// Select the per-architecture grid-value table for \p Kernel on target \p T.
/// For AMDGPU the wavefront size is read from the kernel's "target-features"
/// string attribute.
154static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
155 if (T.isAMDGPU()) {
156 StringRef Features =
157 Kernel->getFnAttribute("target-features").getValueAsString();
// Wave64 targets get the 64-lane table, everything else the 32-lane one.
158 if (Features.count("+wavefrontsize64"))
159 return omp::getAMDGPUGridValues<64>();
160 return omp::getAMDGPUGridValues<32>();
161 }
// NOTE(review): the return statements for the NVPTX and SPIR-V cases
// (original lines 163 and 165) are missing from this capture — as shown, both
// `if`s have empty bodies and fall through to llvm_unreachable. Confirm
// against upstream (presumably `return omp::NVPTXGridValues;` etc.).
162 if (T.isNVPTX())
164 if (T.isSPIRV())
166 llvm_unreachable("No grid value available for this architecture!");
167}
168
169/// Determine which scheduling algorithm to use, determined from schedule clause
170/// arguments.
171static OMPScheduleType
172getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
173 bool HasSimdModifier) {
174 // Currently, the default schedule is static.
175 switch (ClauseKind) {
176 case OMP_SCHEDULE_Default:
177 case OMP_SCHEDULE_Static:
178 return HasChunks ? OMPScheduleType::BaseStaticChunked
179 : OMPScheduleType::BaseStatic;
180 case OMP_SCHEDULE_Dynamic:
181 return OMPScheduleType::BaseDynamicChunked;
182 case OMP_SCHEDULE_Guided:
183 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
184 : OMPScheduleType::BaseGuidedChunked;
// NOTE(review): the return for the Auto case (original line 186, presumably
// `return OMPScheduleType::BaseAuto;`) is missing from this capture — as
// shown, Auto would fall through to the Runtime case. Confirm upstream.
185 case OMP_SCHEDULE_Auto:
187 case OMP_SCHEDULE_Runtime:
188 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
189 : OMPScheduleType::BaseRuntime;
190 }
191 llvm_unreachable("unhandled schedule clause argument");
192}
193
194/// Adds ordering modifier flags to schedule type.
195static OMPScheduleType
// NOTE(review): original line 196 (presumably
// `getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,`) is
// missing from this capture — confirm against upstream.
197 bool HasOrderedClause) {
198 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
199 OMPScheduleType::None &&
200 "Must not have ordering nor monotonicity flags already set");
201
202 OMPScheduleType OrderingModifier = HasOrderedClause
203 ? OMPScheduleType::ModifierOrdered
204 : OMPScheduleType::ModifierUnordered;
205 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
206
// The simd schedule variants have no ordered counterpart in the runtime;
// substitute the closest ordered non-simd schedule instead.
207 // Unsupported combinations
208 if (OrderingScheduleType ==
209 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
210 return OMPScheduleType::OrderedGuidedChunked;
211 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
212 OMPScheduleType::ModifierOrdered))
213 return OMPScheduleType::OrderedRuntime;
214
215 return OrderingScheduleType;
216}
217
218/// Adds monotonicity modifier flags to schedule type.
219static OMPScheduleType
// NOTE(review): original line 220 (presumably
// `getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,`) is
// missing from this capture — confirm against upstream.
221 bool HasSimdModifier, bool HasMonotonic,
222 bool HasNonmonotonic, bool HasOrderedClause) {
223 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
224 OMPScheduleType::None &&
225 "Must not have monotonicity flags already set");
226 assert((!HasMonotonic || !HasNonmonotonic) &&
227 "Monotonic and Nonmonotonic are contradicting each other");
228
229 if (HasMonotonic) {
230 return ScheduleType | OMPScheduleType::ModifierMonotonic;
231 } else if (HasNonmonotonic) {
232 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
233 } else {
234 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
235 // If the static schedule kind is specified or if the ordered clause is
236 // specified, and if the nonmonotonic modifier is not specified, the
237 // effect is as if the monotonic modifier is specified. Otherwise, unless
238 // the monotonic modifier is specified, the effect is as if the
239 // nonmonotonic modifier is specified.
240 OMPScheduleType BaseScheduleType =
241 ScheduleType & ~OMPScheduleType::ModifierMask;
242 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
243 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
244 HasOrderedClause) {
245 // The monotonic is used by default in openmp runtime library, so no need
246 // to set it.
247 return ScheduleType;
248 } else {
249 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
250 }
251 }
252}
253
254/// Determine the schedule type using schedule and ordering clause arguments.
255static OMPScheduleType
256computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
257 bool HasSimdModifier, bool HasMonotonicModifier,
258 bool HasNonmonotonicModifier, bool HasOrderedClause) {
// Compose the final schedule in three layers: base algorithm, then ordering,
// then monotonicity.
259 OMPScheduleType BaseSchedule =
260 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
261 OMPScheduleType OrderedSchedule =
262 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
// NOTE(review): original line 263 (presumably
// `OMPScheduleType Result = getOpenMPMonotonicityScheduleType(`) and line 267
// (presumably `assert(isValidWorkshareLoopScheduleType(Result));`) are
// missing from this capture — confirm against upstream.
264 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
265 HasNonmonotonicModifier, HasOrderedClause);
266
268 return Result;
269}
270
271/// Emit an implicit cast to convert \p XRead to type of variable \p V
// NOTE(review): the function signature (original line 272, presumably
// `static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value
// *XRead, llvm::Value *V)`) is partially missing from this capture.
273 llvm::Value *V) {
274 // TODO: Add this functionality to the `AtomicInfo` interface
275 llvm::Type *XReadType = XRead->getType();
276 llvm::Type *VType = V->getType();
// When V is an alloca, the interesting type is what it allocates, not the
// pointer type itself.
277 if (llvm::AllocaInst *vAlloca = dyn_cast<llvm::AllocaInst>(V))
278 VType = vAlloca->getAllocatedType();
279
280 if (XReadType->isStructTy() && VType->isStructTy())
281 // No need to extract or convert. A direct
282 // `store` will suffice.
283 return XRead;
284
// Unwrap the first member of a struct-typed read, then convert between
// int/float representations as needed. Integer casts are emitted as signed.
285 if (XReadType->isStructTy())
286 XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0);
287 if (VType->isIntegerTy() && XReadType->isFloatingPointTy())
288 XRead = Builder.CreateFPToSI(XRead, VType);
289 else if (VType->isFloatingPointTy() && XReadType->isIntegerTy())
290 XRead = Builder.CreateSIToFP(XRead, VType);
291 else if (VType->isIntegerTy() && XReadType->isIntegerTy())
292 XRead = Builder.CreateIntCast(XRead, VType, true);
293 else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy())
294 XRead = Builder.CreateFPCast(XRead, VType);
295 return XRead;
296}
297
298/// Make \p Source branch to \p Target.
299///
300/// Handles two situations:
301/// * \p Source already has an unconditional branch.
302/// * \p Source is a degenerate block (no terminator because the BB is
303/// the current head of the IR construction).
// NOTE(review): the function signature (original line 304, presumably
// `static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc
// DL)`) is missing from this capture — confirm against upstream.
305 if (Instruction *Term = Source->getTerminator()) {
306 auto *Br = cast<BranchInst>(Term);
307 assert(!Br->isConditional() &&
308 "BB's terminator must be an unconditional branch (or degenerate)");
// Retarget the existing branch, keeping PHIs in the old successor sane.
309 BasicBlock *Succ = Br->getSuccessor(0);
310 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
311 Br->setSuccessor(0, Target);
312 return;
313 }
314
// Degenerate block: create the unconditional branch ourselves.
315 auto *NewBr = BranchInst::Create(Target, Source);
316 NewBr->setDebugLoc(DL);
317}
318
// NOTE(review): the signature line (original line 319, presumably
// `void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,`) is
// missing from this capture — confirm against upstream. Moves everything from
// IP to the end of IP's block into \p New, optionally branching to it.
320 bool CreateBranch, DebugLoc DL) {
321 assert(New->getFirstInsertionPt() == New->begin() &&
322 "Target BB must not have PHI nodes");
323
324 // Move instructions to new block.
325 BasicBlock *Old = IP.getBlock();
326 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
327
328 if (CreateBranch) {
329 auto *NewBr = BranchInst::Create(New, Old);
330 NewBr->setDebugLoc(DL);
331 }
332}
333
334void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
// NOTE(review): original line 335 (presumably
// `DebugLoc DebugLoc = Builder.getCurrentDebugLocation();`) is missing from
// this capture — `DebugLoc` below is used as a value, so confirm upstream.
336 BasicBlock *Old = Builder.GetInsertBlock();
337
338 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
// Keep building in the old block: before its new terminator if one was
// created, otherwise at its (now open) end.
339 if (CreateBranch)
340 Builder.SetInsertPoint(Old->getTerminator());
341 else
342 Builder.SetInsertPoint(Old);
343
344 // SetInsertPoint also updates the Builder's debug location, but we want to
345 // keep the one the Builder was configured to use.
// NOTE(review): original line 346 (presumably
// `Builder.SetCurrentDebugLocation(DebugLoc);`) is missing from this capture.
347}
348
// NOTE(review): the signature (original lines 349-350, presumably
// `BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
// DebugLoc DL, llvm::Twine Name)`) is missing from this capture. Splits the
// block at IP into Old/New, returning the new block inserted after Old.
351 BasicBlock *Old = IP.getBlock();
353 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
354 Old->getParent(), Old->getNextNode());
355 spliceBB(IP, New, CreateBranch, DL);
// Successor PHIs that referenced Old now live after the split point in New.
356 New->replaceSuccessorsPhiUsesWith(Old, New);
357 return New;
358}
359
360BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
// NOTE(review): original lines 361-362 (the rest of the signature plus,
// presumably, `DebugLoc DebugLoc = Builder.getCurrentDebugLocation();`) are
// missing from this capture — confirm against upstream.
363 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
364 if (CreateBranch)
365 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
366 else
367 Builder.SetInsertPoint(Builder.GetInsertBlock());
368 // SetInsertPoint also updates the Builder's debug location, but we want to
369 // keep the one the Builder was configured to use.
// NOTE(review): original line 370 (presumably
// `Builder.SetCurrentDebugLocation(DebugLoc);`) is missing from this capture.
371 return New;
372}
373
374BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
// NOTE(review): original lines 375-376 (the rest of the signature plus,
// presumably, `DebugLoc DebugLoc = Builder.getCurrentDebugLocation();`) are
// missing from this capture — mirrors the IRBuilderBase overload above.
377 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
378 if (CreateBranch)
379 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
380 else
381 Builder.SetInsertPoint(Builder.GetInsertBlock());
382 // SetInsertPoint also updates the Builder's debug location, but we want to
383 // keep the one the Builder was configured to use.
// NOTE(review): original line 384 (presumably
// `Builder.SetCurrentDebugLocation(DebugLoc);`) is missing from this capture.
385 return New;
386}
387
// NOTE(review): the signature line (original line 388, presumably
// `BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool
// CreateBranch,`) is missing from this capture. Convenience wrapper: names
// the new block as the old block's name plus \p Suffix.
389 llvm::Twine Suffix) {
390 BasicBlock *Old = Builder.GetInsertBlock();
391 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
392}
393
394// This function creates a fake integer value and a fake use for the integer
395// value. It returns the fake value created. This is useful in modeling the
396// extra arguments to the outlined functions.
// NOTE(review): original lines 397 and 399 (presumably
// `static Value *createFakeIntVal(IRBuilderBase &Builder,` and
// `llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,`) are missing from this
// capture — confirm against upstream. All fabricated instructions are pushed
// onto ToBeDeleted so the caller can clean them up after outlining.
398 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
400 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
401 const Twine &Name = "", bool AsPtr = true) {
// The fake value lives in the outer alloca scope.
402 Builder.restoreIP(OuterAllocaIP);
403 Instruction *FakeVal;
404 AllocaInst *FakeValAddr =
405 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
406 ToBeDeleted.push_back(FakeValAddr);
407
408 if (AsPtr) {
409 FakeVal = FakeValAddr;
410 } else {
411 FakeVal =
412 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
413 ToBeDeleted.push_back(FakeVal);
414 }
415
416 // Generate a fake use of this value
417 Builder.restoreIP(InnerAllocaIP);
418 Instruction *UseFakeVal;
419 if (AsPtr) {
420 UseFakeVal =
421 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
422 } else {
423 UseFakeVal =
424 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
425 }
426 ToBeDeleted.push_back(UseFakeVal);
427 return FakeVal;
428}
429
430//===----------------------------------------------------------------------===//
431// OpenMPIRBuilderConfig
432//===----------------------------------------------------------------------===//
433
434namespace {
// NOTE(review): original line 435 is missing from this capture.
436/// Values for bit flags for marking which requires clauses have been used.
437enum OpenMPOffloadingRequiresDirFlags {
438 /// flag undefined.
439 OMP_REQ_UNDEFINED = 0x000,
440 /// no requires directive present.
441 OMP_REQ_NONE = 0x001,
442 /// reverse_offload clause.
443 OMP_REQ_REVERSE_OFFLOAD = 0x002,
444 /// unified_address clause.
445 OMP_REQ_UNIFIED_ADDRESS = 0x004,
446 /// unified_shared_memory clause.
447 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
448 /// dynamic_allocators clause.
449 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
450 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
451};
452
453} // anonymous namespace
454
// NOTE(review): the default-constructor signature (original line 455,
// presumably `OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()`) is missing
// from this capture.
456 : RequiresFlags(OMP_REQ_UNDEFINED) {}
457
// NOTE(review): the signature of the full constructor (original line 458,
// presumably `OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(`) is missing from
// this capture. Each Has* flag sets the corresponding requires bit.
459 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
460 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
461 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
462 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
463 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
464 RequiresFlags(OMP_REQ_UNDEFINED) {
465 if (HasRequiresReverseOffload)
466 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
467 if (HasRequiresUnifiedAddress)
468 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
469 if (HasRequiresUnifiedSharedMemory)
470 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
471 if (HasRequiresDynamicAllocators)
472 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
473}
474
// Requires-flag accessors. NOTE(review): every getter's signature line
// (original lines 475, 479, 483, 487, 491 — presumably
// `bool OpenMPIRBuilderConfig::hasRequires...() const {` etc.) is missing
// from this capture; only the bodies survive.
476 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
477}
478
480 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
481}
482
484 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
485}
486
488 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
489}
490
// Returns OMP_REQ_NONE when no requires directive was seen at all.
492 return hasRequiresFlags() ? RequiresFlags
493 : static_cast<int64_t>(OMP_REQ_NONE);
494}
495
// Requires-flag mutators: set or clear one bit each. NOTE(review): every
// setter's signature line (original lines 496, 503, 510, 517 — presumably
// `void OpenMPIRBuilderConfig::setHasRequires...(bool Value) {` etc.) is
// missing from this capture; only the bodies survive.
497 if (Value)
498 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
499 else
500 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
501}
502
504 if (Value)
505 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
506 else
507 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
508}
509
511 if (Value)
512 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
513 else
514 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
515}
516
518 if (Value)
519 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
520 else
521 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
522}
523
524//===----------------------------------------------------------------------===//
525// OpenMPIRBuilder
526//===----------------------------------------------------------------------===//
527
// NOTE(review): the signature (original line 528, presumably
// `void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,`)
// and line 531 (presumably `Value *Version = ...;`) are missing from this
// capture — `Version` below is used but its definition is not visible.
// Packs the target-kernel launch arguments into the flat ArgsVector expected
// by the runtime's launch entry point.
529 IRBuilderBase &Builder,
530 SmallVector<Value *> &ArgsVector) {
532 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
533 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
534 constexpr const size_t MaxDim = 3;
535 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
536 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
537
538 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
539
// Expand NumTeams/NumThreads into 3-element arrays, zero-padding unused dims
// and capping at MaxDim.
540 Value *NumTeams3D =
541 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
542 Value *NumThreads3D =
543 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
544 for (unsigned I :
545 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
546 NumTeams3D =
547 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
548 for (unsigned I :
549 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
550 NumThreads3D =
551 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
552
553 ArgsVector = {Version,
554 PointerNum,
555 KernelArgs.RTArgs.BasePointersArray,
556 KernelArgs.RTArgs.PointersArray,
557 KernelArgs.RTArgs.SizesArray,
558 KernelArgs.RTArgs.MapTypesArray,
559 KernelArgs.RTArgs.MapNamesArray,
560 KernelArgs.RTArgs.MappersArray,
561 KernelArgs.NumIterations,
562 Flags,
563 NumTeams3D,
564 NumThreads3D,
565 KernelArgs.DynCGGroupMem};
566}
567
// NOTE(review): the signature (original line 568, presumably
// `void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function
// &Fn) {`) and line 575 (presumably the `ArgAttrs` vector declaration) are
// missing from this capture — `ArgAttrs` below is used without a visible
// definition. Attaches the per-runtime-function attribute sets generated
// from OMPKinds.def to the declaration \p Fn.
569 LLVMContext &Ctx = Fn.getContext();
570
571 // Get the function's current attributes.
572 auto Attrs = Fn.getAttributes();
573 auto FnAttrs = Attrs.getFnAttrs();
574 auto RetAttrs = Attrs.getRetAttrs();
576 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
577 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
578
579 // Add AS to FnAS while taking special care with integer extensions.
580 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
581 bool Param = true) -> void {
582 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
583 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
584 if (HasSignExt || HasZeroExt) {
585 assert(AS.getNumAttributes() == 1 &&
586 "Currently not handling extension attr combined with others.");
// Translate the generic s/zext markers into the target's i32 argument/return
// extension attribute, if the target defines one.
587 if (Param) {
588 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
589 FnAS = FnAS.addAttribute(Ctx, AK);
590 } else if (auto AK =
591 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
592 FnAS = FnAS.addAttribute(Ctx, AK);
593 } else {
594 FnAS = FnAS.addAttributes(Ctx, AS);
595 }
596 };
597
598#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
599#include "llvm/Frontend/OpenMP/OMPKinds.def"
600
601 // Add attributes to the function declaration.
602 switch (FnID) {
603#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
604 case Enum: \
605 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
606 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
607 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
608 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
609 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
610 break;
611#include "llvm/Frontend/OpenMP/OMPKinds.def"
612 default:
613 // Attributes are optional.
614 break;
615 }
616}
617
// NOTE(review): the signature (original lines 618-619, presumably
// `FunctionCallee OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M,
// RuntimeFunction FnID)`) is missing from this capture. Looks up or creates
// the declaration for an OpenMP runtime function described in OMPKinds.def.
620 FunctionType *FnTy = nullptr;
621 Function *Fn = nullptr;
622
623 // Try to find the declation in the module first.
624 switch (FnID) {
625#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
626 case Enum: \
627 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
628 IsVarArg); \
629 Fn = M.getFunction(Str); \
630 break;
631#include "llvm/Frontend/OpenMP/OMPKinds.def"
632 }
633
634 if (!Fn) {
635 // Create a new declaration if we need one.
636 switch (FnID) {
637#define OMP_RTL(Enum, Str, ...) \
638 case Enum: \
639 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
640 break;
641#include "llvm/Frontend/OpenMP/OMPKinds.def"
642 }
643
644 // Add information if the runtime function takes a callback function
645 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
646 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
647 LLVMContext &Ctx = Fn->getContext();
648 MDBuilder MDB(Ctx);
649 // Annotate the callback behavior of the runtime function:
650 // - The callback callee is argument number 2 (microtask).
651 // - The first two arguments of the callback callee are unknown (-1).
652 // - All variadic arguments to the runtime function are passed to the
653 // callback callee.
// NOTE(review): original line 656 (presumably the
// `MDNode::get(Ctx, {MDB.createCallbackEncoding(` wrapper) is missing from
// this capture — the call below is truncated; confirm against upstream.
654 Fn->addMetadata(
655 LLVMContext::MD_callback,
657 2, {-1, -1}, /* VarArgsArePassed */ true)}));
658 }
659 }
660
661 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
662 << " with type " << *Fn->getFunctionType() << "\n");
663 addAttributes(FnID, *Fn);
664
665 } else {
666 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
667 << " with type " << *Fn->getFunctionType() << "\n");
668 }
669
670 assert(Fn && "Failed to create OpenMP runtime function");
671
672 return {FnTy, Fn};
673}
674
// NOTE(review): the signature (original lines 675-676, presumably
// `Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction
// FnID) { FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);`) is
// missing from this capture. Returns the callee as a plain Function*.
677 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
678 assert(Fn && "Failed to create OpenMP runtime function pointer");
679 return Fn;
680}
681
// One-time setup: populate the cached LLVM types used by the builder.
682void OpenMPIRBuilder::initialize() { initializeTypes(M); }
683
// NOTE(review): the signature (original lines 684-685, presumably
// `static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase
// &Builder, Function *Function)`) is missing from this capture. Hoists
// constant-sized allocas from non-entry blocks to the function entry so later
// passes see well-formed IR.
686 BasicBlock &EntryBlock = Function->getEntryBlock();
687 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
688
689 // Loop over blocks looking for constant allocas, skipping the entry block
690 // as any allocas there are already in the desired location.
691 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
692 Block++) {
// NOTE(review): `Block->getReverseIterator()->begin()` is unusual — it
// iterates the instructions of the block reached via the reverse ilist
// iterator; verify against upstream that this visits the intended block.
693 for (auto Inst = Block->getReverseIterator()->begin();
694 Inst != Block->getReverseIterator()->end();) {
695 if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
// Advance before moving the alloca, so the iterator stays valid.
696 Inst++;
697 if (!isa<ConstantData>(AllocaInst->getArraySize()))
698 continue;
699 AllocaInst->moveBeforePreserving(MoveLocInst);
700 } else {
701 Inst++;
702 }
703 }
704 }
705}
706
// NOTE(review): the signature (original line 707, presumably
// `void OpenMPIRBuilder::finalize(Function *Fn) {`) and line 709 (presumably
// `SmallVector<BasicBlock *, 32> Blocks;`) are missing from this capture —
// `Blocks` below is used without a visible definition. Finalization extracts
// every recorded OutlineInfo region into its own function via CodeExtractor.
708 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
710 SmallVector<OutlineInfo, 16> DeferredOutlines;
711 for (OutlineInfo &OI : OutlineInfos) {
712 // Skip functions that have not finalized yet; may happen with nested
713 // function generation.
714 if (Fn && OI.getFunction() != Fn) {
715 DeferredOutlines.push_back(OI);
716 continue;
717 }
718
719 ParallelRegionBlockSet.clear();
720 Blocks.clear();
721 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
722
723 Function *OuterFn = OI.getFunction();
724 CodeExtractorAnalysisCache CEAC(*OuterFn);
725 // If we generate code for the target device, we need to allocate
726 // struct for aggregate params in the device default alloca address space.
727 // OpenMP runtime requires that the params of the extracted functions are
728 // passed as zero address space pointers. This flag ensures that
729 // CodeExtractor generates correct code for extracted functions
730 // which are used by OpenMP runtime.
731 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
732 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
733 /* AggregateArgs */ true,
734 /* BlockFrequencyInfo */ nullptr,
735 /* BranchProbabilityInfo */ nullptr,
736 /* AssumptionCache */ nullptr,
737 /* AllowVarArgs */ true,
738 /* AllowAlloca */ true,
739 /* AllocaBlock*/ OI.OuterAllocaBB,
740 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
741
742 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
743 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
744 << " Exit: " << OI.ExitBB->getName() << "\n");
745 assert(Extractor.isEligible() &&
746 "Expected OpenMP outlining to be possible!");
747
748 for (auto *V : OI.ExcludeArgsFromAggregate)
749 Extractor.excludeArgFromAggregate(V);
750
751 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
752
753 // Forward target-cpu, target-features attributes to the outlined function.
754 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
755 if (TargetCpuAttr.isStringAttribute())
756 OutlinedFn->addFnAttr(TargetCpuAttr);
757
758 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
759 if (TargetFeaturesAttr.isStringAttribute())
760 OutlinedFn->addFnAttr(TargetFeaturesAttr);
761
762 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
763 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
764 assert(OutlinedFn->getReturnType()->isVoidTy() &&
765 "OpenMP outlined functions should not return a value!");
766
767 // For compatibility with the clang CG we move the outlined function after
768 // the one with the parallel region.
769 OutlinedFn->removeFromParent();
770 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
771
772 // Remove the artificial entry introduced by the extractor right away, we
773 // made our own entry block after all.
774 {
775 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
776 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
777 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
778 // Move instructions from the to-be-deleted ArtificialEntry to the entry
779 // basic block of the parallel region. CodeExtractor generates
780 // instructions to unwrap the aggregate argument and may sink
781 // allocas/bitcasts for values that are solely used in the outlined region
782 // and do not escape.
783 assert(!ArtificialEntry.empty() &&
784 "Expected instructions to add in the outlined region entry");
785 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
786 End = ArtificialEntry.rend();
787 It != End;) {
788 Instruction &I = *It;
789 It++;
790
791 if (I.isTerminator())
792 continue;
793
794 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
795 }
796
797 OI.EntryBB->moveBefore(&ArtificialEntry);
798 ArtificialEntry.eraseFromParent();
799 }
800 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
801 assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
802
803 // Run a user callback, e.g. to add attributes.
804 if (OI.PostOutlineCB)
805 OI.PostOutlineCB(*OutlinedFn);
806 }
807
808 // Remove work items that have been completed.
809 OutlineInfos = std::move(DeferredOutlines);
810
811 // The createTarget functions embeds user written code into
812 // the target region which may inject allocas which need to
813 // be moved to the entry block of our target or risk malformed
814 // optimisations by later passes, this is only relevant for
815 // the device pass which appears to be a little more delicate
816 // when it comes to optimisations (however, we do not block on
817 // that here, it's up to the inserter to the list to do so).
818 // This notably has to occur after the OutlinedInfo candidates
819 // have been extracted so we have an end product that will not
820 // be implicitly adversely affected by any raises unless
821 // intentionally appended to the list.
822 // NOTE: This only does so for ConstantData, it could be extended
823 // to ConstantExpr's with further effort, however, they should
824 // largely be folded when they get here. Extending it to runtime
825 // defined/read+writeable allocation sizes would be non-trivial
826 // (need to factor in movement of any stores to variables the
827 // allocation size depends on, as well as the usual loads,
828 // otherwise it'll yield the wrong result after movement) and
829 // likely be more suitable as an LLVM optimisation pass.
// NOTE(review): original lines 830-831 (presumably the loop invoking
// raiseUserConstantDataAllocasToEntryBlock over the recorded functions) are
// missing from this capture — confirm against upstream.
832
833 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
834 [](EmitMetadataErrorKind Kind,
835 const TargetRegionEntryInfo &EntryInfo) -> void {
836 errs() << "Error of kind: " << Kind
837 << " when emitting offload entries and metadata during "
838 "OMPIRBuilder finalization \n";
839 };
840
// NOTE(review): original lines 841-842 (presumably the guarded call to
// createOffloadEntriesAndInfoMetadata(ErrorReportFn)) are missing from this
// capture — as shown, ErrorReportFn is otherwise unused.
843
844 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
845 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
846 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
847 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
848 }
849}
850
// NOTE(review): the destructor signature (original line 851, presumably
// `OpenMPIRBuilder::~OpenMPIRBuilder() {`) is missing from this capture.
// Sanity check: finalize() must have consumed all outline work items.
852 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
853}
854
// NOTE(review): the signature (original lines 855-856, presumably
// `GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef
// Name) { Type *I32Ty = Type::getInt32Ty(M.getContext());`) is missing from
// this capture. Emits a hidden weak_odr i32 constant used as a flag global.
857 auto *GV =
858 new GlobalVariable(M, I32Ty,
859 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
860 ConstantInt::get(I32Ty, Value), Name);
861 GV->setVisibility(GlobalValue::HiddenVisibility);
862
863 return GV;
864}
865
// NOTE(review): the signature (original line 866, presumably
// `void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH>
// List)`) and lines 871/874 (the `UsedArray` declaration and the element
// assignment's first line) are missing from this capture — confirm upstream.
// Emits an appending-linkage global (e.g. llvm.compiler.used) holding \p List.
867 if (List.empty())
868 return;
869
870 // Convert List to what ConstantArray needs.
872 UsedArray.resize(List.size());
873 for (unsigned I = 0, E = List.size(); I != E; ++I)
875 cast<Constant>(&*List[I]), Builder.getPtrTy());
876
877 if (UsedArray.empty())
878 return;
879 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
880
881 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
882 ConstantArray::get(ATy, UsedArray), Name);
883
// The special section keeps the array from being treated as ordinary data.
884 GV->setSection("llvm.metadata");
885}
886
// NOTE(review): the signature (original lines 887-888, presumably
// `GlobalVariable *OpenMPIRBuilder::createTargetRegionEntryAddr`-style helper
// taking `StringRef KernelName,`) is missing from this capture. Emits the
// `<kernel>_exec_mode` i8 global the device runtime reads to pick the
// execution mode for a kernel.
889 OMPTgtExecModeFlags Mode) {
890 auto *Int8Ty = Builder.getInt8Ty();
891 auto *GVMode = new GlobalVariable(
892 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
893 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
894 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
895 return GVMode;
896}
897
// NOTE(review): the signature start (original line 898, presumably
// `Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,`) is
// missing from this capture. Returns (creating and caching on first use) the
// `ident_t` structure for a source location + flag combination.
899 uint32_t SrcLocStrSize,
900 IdentFlag LocFlags,
901 unsigned Reserve2Flags) {
902 // Enable "C-mode".
903 LocFlags |= OMP_IDENT_FLAG_KMPC;
904
905 Constant *&Ident =
// NOTE(review): `<< 31` as a cache-key combiner looks suspicious for a flag
// word — verify the shift amount against upstream.
906 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
907 if (!Ident) {
// NOTE(review): original line 908 (presumably the `I32Null` / `Int32`
// definitions) is missing from this capture — both are used below.
909 Constant *IdentData[] = {I32Null,
910 ConstantInt::get(Int32, uint32_t(LocFlags)),
911 ConstantInt::get(Int32, Reserve2Flags),
912 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
913 Constant *Initializer =
914 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
915
916 // Look for existing encoding of the location + flags, not needed but
917 // minimizes the difference to the existing solution while we transition.
918 for (GlobalVariable &GV : M.globals())
919 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
920 if (GV.getInitializer() == Initializer)
921 Ident = &GV;
922
923 if (!Ident) {
// NOTE(review): original lines 927-928 (remaining GlobalVariable constructor
// arguments) and line 935 (presumably the pointer-cast `return`) are missing
// from this capture — confirm against upstream.
924 auto *GV = new GlobalVariable(
925 M, OpenMPIRBuilder::Ident,
926 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
929 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
930 GV->setAlignment(Align(8));
931 Ident = GV;
932 }
933 }
934
936}
937
// NOTE(review): the signature start (original line 938, presumably
// `Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,`) and
// line 944 (presumably `ConstantDataArray::getString(...)`) are missing from
// this capture. Interns a source-location string as a global, reusing any
// existing global with the same initializer.
939 uint32_t &SrcLocStrSize) {
940 SrcLocStrSize = LocStr.size();
941 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
942 if (!SrcLocStr) {
943 Constant *Initializer =
945
946 // Look for existing encoding of the location, not needed but minimizes the
947 // difference to the existing solution while we transition.
948 for (GlobalVariable &GV : M.globals())
949 if (GV.isConstant() && GV.hasInitializer() &&
950 GV.getInitializer() == Initializer)
951 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
952
953 SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
954 /* AddressSpace */ 0, &M);
955 }
956 return SrcLocStr;
957}
958
// Overload building the canonical ";file;function;line;column;;" location
// string from its components, then delegating to the StringRef overload.
// (Presumed OpenMPIRBuilder::getOrCreateSrcLocStr — declaration line missing
// from this listing.)
960 StringRef FileName,
961 unsigned Line, unsigned Column,
962 uint32_t &SrcLocStrSize) {
963 SmallString<128> Buffer;
964 Buffer.push_back(';');
965 Buffer.append(FileName);
966 Buffer.push_back(';');
967 Buffer.append(FunctionName);
968 Buffer.push_back(';');
969 Buffer.append(std::to_string(Line));
970 Buffer.push_back(';');
971 Buffer.append(std::to_string(Column));
972 Buffer.push_back(';');
973 Buffer.push_back(';');
974 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
975}
976
// Fallback location string used when no debug info is available; same
// ";file;function;line;column;;" layout with "unknown"/0 placeholders.
977Constant *
979 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
980 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
981}
982
// Overload deriving the location string from a DebugLoc: falls back to the
// default string when no DILocation exists, prefers the DIFile's embedded
// source (when present) over the module name, and uses F's name if the
// subprogram name is empty. (Presumed OpenMPIRBuilder::getOrCreateSrcLocStr —
// declaration line missing from this listing.)
984 uint32_t &SrcLocStrSize,
985 Function *F) {
986 DILocation *DIL = DL.get();
987 if (!DIL)
988 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
// NOTE(review): FileName is seeded with the module name, then overridden by
// the DIFile source if available — the DIFile's *filename* is not consulted
// in the visible text; confirm against upstream.
989 StringRef FileName = M.getName();
990 if (DIFile *DIF = DIL->getFile())
991 if (std::optional<StringRef> Source = DIF->getSource())
992 FileName = *Source;
993 StringRef Function = DIL->getScope()->getSubprogram()->getName();
994 if (Function.empty() && F)
995 Function = F->getName();
996 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
997 DIL->getColumn(), SrcLocStrSize);
998}
999
// Convenience overload: extracts the DebugLoc and enclosing function from a
// LocationDescription and forwards to the DebugLoc overload.
1001 uint32_t &SrcLocStrSize) {
1002 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1003 Loc.IP.getBlock()->getParent());
1004}
1005
// Emits a call to __kmpc_global_thread_num(ident) at the current insertion
// point and returns the resulting thread-id value. (Presumed
// OpenMPIRBuilder::getOrCreateThreadID — declaration line missing.)
1007 return Builder.CreateCall(
1008 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1009 "omp_global_thread_num");
1010}
1011
// Emits an OpenMP barrier: picks ident flags from the directive Kind,
// chooses __kmpc_cancel_barrier vs __kmpc_barrier depending on whether the
// enclosing parallel region is cancellable, and optionally emits the
// cancellation check on the barrier's result. (Presumed
// OpenMPIRBuilder::createBarrier — declaration line missing.)
1014 bool ForceSimpleCall, bool CheckCancelFlag) {
1015 if (!updateToLocation(Loc))
1016 return Loc.IP;
1017
1018 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1019 // __kmpc_barrier(loc, thread_id);
1020
// Encode which construct the (possibly implicit) barrier belongs to in the
// ident flags so the runtime can attribute it.
1021 IdentFlag BarrierLocFlags;
1022 switch (Kind) {
1023 case OMPD_for:
1024 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1025 break;
1026 case OMPD_sections:
1027 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1028 break;
1029 case OMPD_single:
1030 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1031 break;
1032 case OMPD_barrier:
1033 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1034 break;
1035 default:
1036 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1037 break;
1038 }
1039
1040 uint32_t SrcLocStrSize;
1041 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// Note: the barrier-flagged ident is passed as the call's ident argument,
// while the thread id is obtained with the plain (unflagged) ident.
1042 Value *Args[] = {
1043 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1044 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1045
1046 // If we are in a cancellable parallel region, barriers are cancellation
1047 // points.
1048 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1049 bool UseCancelBarrier =
1050 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1051
// NOTE(review): the CreateCall/getOrCreateRuntimeFunctionPtr line is missing
// from this listing — confirm against upstream.
1052 Value *Result =
1054 UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
1055 : OMPRTL___kmpc_barrier),
1056 Args);
1057
1058 if (UseCancelBarrier && CheckCancelFlag)
1059 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1060 return Err;
1061
1062 return Builder.saveIP();
1063}
1064
// Emits an OpenMP "cancel" construct: optionally guards it with IfCondition,
// maps the canceled directive to a runtime cancel-kind constant, calls
// __kmpc_cancel, and wires the result into the shared cancellation-check
// logic. (Presumed OpenMPIRBuilder::createCancel — declaration line missing.)
1067 Value *IfCondition,
1068 omp::Directive CanceledDirective) {
1069 if (!updateToLocation(Loc))
1070 return Loc.IP;
1071
1072 // LLVM utilities like blocks with terminators.
1073 auto *UI = Builder.CreateUnreachable();
1074
// With an if-clause, split into then/else so the cancel call only runs when
// the condition holds; otherwise everything goes before the placeholder.
1075 Instruction *ThenTI = UI, *ElseTI = nullptr;
1076 if (IfCondition)
1077 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1078 Builder.SetInsertPoint(ThenTI);
1079
// Translate the directive into the integer cancel kind the runtime expects,
// using the table in OMPKinds.def.
1080 Value *CancelKind = nullptr;
1081 switch (CanceledDirective) {
1082#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1083 case DirectiveEnum: \
1084 CancelKind = Builder.getInt32(Value); \
1085 break;
1086#include "llvm/Frontend/OpenMP/OMPKinds.def"
1087 default:
1088 llvm_unreachable("Unknown cancel kind!");
1089 }
1090
1091 uint32_t SrcLocStrSize;
1092 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1093 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1094 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1095 Value *Result = Builder.CreateCall(
1096 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
// Exit callback run on the cancellation path; for a canceled parallel it
// emits a non-checking barrier. NOTE(review): the createBarrier call line is
// missing from this listing — confirm against upstream.
1097 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
1098 if (CanceledDirective == OMPD_parallel) {
1100 Builder.restoreIP(IP);
1102 omp::Directive::OMPD_unknown,
1103 /* ForceSimpleCall */ false,
1104 /* CheckCancelFlag */ false)
1105 .takeError();
1106 }
1107 return Error::success();
1108 };
1109
1110 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1111 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
1112 return Err;
1113
1114 // Update the insertion point and remove the terminator we introduced.
1115 Builder.SetInsertPoint(UI->getParent());
1116 UI->eraseFromParent();
1117
1118 return Builder.saveIP();
1119}
1120
// Emits the __tgt_target_kernel launch call: allocates a kernel_args struct
// at AllocaIP, stores each kernel argument into it at the current location,
// and issues the runtime call, returning its result through `Return`.
// (Presumed OpenMPIRBuilder::emitTargetKernel — declaration line missing.)
1122 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1123 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1124 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1125 if (!updateToLocation(Loc))
1126 return Loc.IP;
1127
// The argument struct must live in the function's alloca block, not at the
// launch site, hence the temporary insertion-point switch.
1128 Builder.restoreIP(AllocaIP);
1129 auto *KernelArgsPtr =
1130 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1131 Builder.restoreIP(Loc.IP);
1132
// Copy each argument into its struct field. NOTE(review): the store-creating
// line (1136) is missing from this listing — confirm against upstream.
1133 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1134 llvm::Value *Arg =
1135 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1137 KernelArgs[I], Arg,
1138 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1139 }
1140
1141 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1142 NumThreads, HostPtr, KernelArgsPtr};
1143
1144 Return = Builder.CreateCall(
1145 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1146 OffloadingArgs);
1147
1148 return Builder.saveIP();
1149}
1150
// Emits the full target-kernel launch sequence: packs the kernel arguments,
// calls the offload runtime, and branches to a fallback (host) path when the
// launch reports failure. (Presumed OpenMPIRBuilder::emitKernelLaunch —
// declaration line missing from this listing.)
1152 const LocationDescription &Loc, Value *OutlinedFnID,
1153 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1154 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1155
1156 if (!updateToLocation(Loc))
1157 return Loc.IP;
1158
1159 Builder.restoreIP(Loc.IP);
1160 // On top of the arrays that were filled up, the target offloading call
1161 // takes as arguments the device id as well as the host pointer. The host
1162 // pointer is used by the runtime library to identify the current target
1163 // region, so it only has to be unique and not necessarily point to
1164 // anything. It could be the pointer to the outlined function that
1165 // implements the target region, but we aren't using that so that the
1166 // compiler doesn't need to keep that, and could therefore inline the host
1167 // function if proven worthwhile during optimization.
1168
1169 // From this point on, we need to have an ID of the target region defined.
1170 assert(OutlinedFnID && "Invalid outlined function ID!");
1171 (void)OutlinedFnID;
1172
1173 // Return value of the runtime offloading call.
1174 Value *Return = nullptr;
1175
1176 // Arguments for the target kernel.
1177 SmallVector<Value *> ArgsVector;
1178 getKernelArgsVector(Args, Builder, ArgsVector);
1179
1180 // The target region is an outlined function launched by the runtime
1181 // via calls to __tgt_target_kernel().
1182 //
1183 // Note that on the host and CPU targets, the runtime implementation of
1184 // these calls simply call the outlined function without forking threads.
1185 // The outlined functions themselves have runtime calls to
1186 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1187 // the compiler in emitTeamsCall() and emitParallelCall().
1188 //
1189 // In contrast, on the NVPTX target, the implementation of
1190 // __tgt_target_teams() launches a GPU kernel with the requested number
1191 // of teams and threads so no additional calls to the runtime are required.
1192 // Check the error code and execute the host version if required.
// NOTE(review): the emitTargetKernel call line (1193) is missing from this
// listing — confirm against upstream.
1194 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1195 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1196
// A non-zero runtime result selects the host fallback path.
// NOTE(review): the line computing `Failed` (1201) is missing — confirm.
1197 BasicBlock *OffloadFailedBlock =
1198 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1199 BasicBlock *OffloadContBlock =
1200 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1202 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1203
1204 auto CurFn = Builder.GetInsertBlock()->getParent();
1205 emitBlock(OffloadFailedBlock, CurFn);
1206 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1207 if (!AfterIP)
1208 return AfterIP.takeError();
1209 Builder.restoreIP(*AfterIP);
1210 emitBranch(OffloadContBlock);
1211 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1212 return Builder.saveIP();
1213}
1214
// Shared cancellation plumbing: branches on CancelFlag to either a
// continuation block (flag == 0) or a cancellation block that runs the
// optional ExitCB and the current finalization callback. (Presumed
// OpenMPIRBuilder::emitCancelationCheckImpl — declaration line missing.)
1216 Value *CancelFlag, omp::Directive CanceledDirective,
1217 FinalizeCallbackTy ExitCB) {
1218 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1219 "Unexpected cancellation!");
1220
1221 // For a cancel barrier we create two new blocks.
// NOTE(review): the line defining `BB` (1222) is missing from this listing —
// confirm against upstream.
1223 BasicBlock *NonCancellationBlock;
1224 if (Builder.GetInsertPoint() == BB->end()) {
1225 // TODO: This branch will not be needed once we moved to the
1226 // OpenMPIRBuilder codegen completely.
1227 NonCancellationBlock = BasicBlock::Create(
1228 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1229 } else {
// NOTE(review): lines 1231-1232 (after the SplitBlock call) are missing from
// this listing — confirm against upstream.
1230 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1233 }
1234 BasicBlock *CancellationBlock = BasicBlock::Create(
1235 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1236
1237 // Jump to them based on the return value.
1238 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1239 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1240 /* TODO weight */ nullptr, nullptr);
1241
1242 // From the cancellation block we finalize all variables and go to the
1243 // post finalization block that is known to the FiniCB callback.
1244 Builder.SetInsertPoint(CancellationBlock);
1245 if (ExitCB)
1246 if (Error Err = ExitCB(Builder.saveIP()))
1247 return Err;
1248 auto &FI = FinalizationStack.back();
1249 if (Error Err = FI.FiniCB(Builder.saveIP()))
1250 return Err;
1251
1252 // The continuation block is where code generation continues.
1253 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1254 return Error::success();
1255}
1256
1257// Callback used to create OpenMP runtime calls to support
1258// omp parallel clause for the device.
1259// We need to use this callback to replace call to the OutlinedFn in OuterFn
1260// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
// NOTE(review): the function's declaration line (1261, presumably
// `static void targetParallelCallback(`) is missing from this listing.
1262 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1263 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1264 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1265 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1266 // Add some known attributes.
1267 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1268 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1269 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1270 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1271 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1272 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1273
1274 assert(OutlinedFn.arg_size() >= 2 &&
1275 "Expected at least tid and bounded tid as arguments");
1276 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1277
// The outliner left exactly one call to OutlinedFn; it is located via the
// use-list and later replaced by the kmpc_parallel_51 call.
1278 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1279 assert(CI && "Expected call instruction to outlined function");
1280 CI->getParent()->setName("omp_parallel");
1281
1282 Builder.SetInsertPoint(CI);
1283 Type *PtrTy = OMPIRBuilder->VoidPtr;
1284 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1285
1286 // Add alloca for kernel args
1287 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1288 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1289 AllocaInst *ArgsAlloca =
1290 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1291 Value *Args = ArgsAlloca;
1292 // Add address space cast if array for storing arguments is not allocated
1293 // in address space 0
1294 if (ArgsAlloca->getAddressSpace())
1295 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1296 Builder.restoreIP(CurrentIP);
1297
1298 // Store captured vars which are used by kmpc_parallel_51
1299 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1300 Value *V = *(CI->arg_begin() + 2 + Idx);
1301 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1302 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1303 Builder.CreateStore(V, StoreAddress);
1304 }
1305
// If-clause: the runtime takes an i32 condition; default to 1 (true).
1306 Value *Cond =
1307 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1308 : Builder.getInt32(1);
1309
1310 // Build kmpc_parallel_51 call
1311 Value *Parallel51CallArgs[] = {
1312 /* identifier*/ Ident,
1313 /* global thread num*/ ThreadID,
1314 /* if expression */ Cond,
1315 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1316 /* Proc bind */ Builder.getInt32(-1),
1317 /* outlined function */
1318 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
1319 /* wrapper function */ NullPtrValue,
1320 /* arguments of the outlined funciton*/ Args,
1321 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
1322
1323 FunctionCallee RTLFn =
1324 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
1325
1326 Builder.CreateCall(RTLFn, Parallel51CallArgs);
1327
1328 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
1329 << *Builder.GetInsertBlock()->getParent() << "\n");
1330
1331 // Initialize the local TID stack location with the argument value.
1332 Builder.SetInsertPoint(PrivTID);
1333 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1334 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1335 PrivTIDAddr);
1336
1337 // Remove redundant call to the outlined function.
1338 CI->eraseFromParent();
1339
// Clean up instructions that only existed to model the region.
1340 for (Instruction *I : ToBeDeleted) {
1341 I->eraseFromParent();
1342 }
1343}
1344
1345// Callback used to create OpenMP runtime calls to support
1346// omp parallel clause for the host.
1347// We need to use this callback to replace call to the OutlinedFn in OuterFn
1348// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1349static void
// NOTE(review): the continuation of the declaration (line 1350, presumably
// `hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,`)
// is missing from this listing.
1351 Function *OuterFn, Value *Ident, Value *IfCondition,
1352 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1353 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1354 IRBuilder<> &Builder = OMPIRBuilder->Builder;
// With an if-clause the conditional fork entry point is used instead.
1355 FunctionCallee RTLFn;
1356 if (IfCondition) {
1357 RTLFn =
1358 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1359 } else {
1360 RTLFn =
1361 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1362 }
1363 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1364 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1365 LLVMContext &Ctx = F->getContext();
1366 MDBuilder MDB(Ctx);
1367 // Annotate the callback behavior of the __kmpc_fork_call:
1368 // - The callback callee is argument number 2 (microtask).
1369 // - The first two arguments of the callback callee are unknown (-1).
1370 // - All variadic arguments to the __kmpc_fork_call are passed to the
1371 // callback callee.
// NOTE(review): line 1373 (the MDNode::get/createCallbackEncoding call) is
// missing from this listing — confirm against upstream.
1372 F->addMetadata(LLVMContext::MD_callback,
1374 2, {-1, -1},
1375 /* VarArgsArePassed */ true)}));
1376 }
1377 }
1378 // Add some known attributes.
1379 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1380 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1381 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1382
1383 assert(OutlinedFn.arg_size() >= 2 &&
1384 "Expected at least tid and bounded tid as arguments");
1385 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1386
// Locate the single outliner-generated call so it can be replaced by the
// fork call below.
1387 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1388 CI->getParent()->setName("omp_parallel");
1389 Builder.SetInsertPoint(CI);
1390
1391 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1392 Value *ForkCallArgs[] = {
1393 Ident, Builder.getInt32(NumCapturedVars),
1394 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};
1395
1396 SmallVector<Value *, 16> RealArgs;
1397 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1398 if (IfCondition) {
1399 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1400 RealArgs.push_back(Cond);
1401 }
// Forward the captured variables, skipping the two tid arguments.
1402 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1403
1404 // __kmpc_fork_call_if always expects a void ptr as the last argument
1405 // If there are no arguments, pass a null pointer.
1406 auto PtrTy = OMPIRBuilder->VoidPtr;
1407 if (IfCondition && NumCapturedVars == 0) {
1408 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1409 RealArgs.push_back(NullPtrValue);
1410 }
1411 if (IfCondition && RealArgs.back()->getType() != PtrTy)
1412 RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);
1413
1414 Builder.CreateCall(RTLFn, RealArgs);
1415
1416 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1417 << *Builder.GetInsertBlock()->getParent() << "\n");
1418
1419 // Initialize the local TID stack location with the argument value.
1420 Builder.SetInsertPoint(PrivTID);
1421 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1422 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1423 PrivTIDAddr);
1424
1425 // Remove redundant call to the outlined function.
1426 CI->eraseFromParent();
1427
// Clean up instructions that only existed to model the region.
1428 for (Instruction *I : ToBeDeleted) {
1429 I->eraseFromParent();
1430 }
1431}
1432
// Builds an OpenMP parallel region: emits num_threads/proc_bind runtime
// calls, splits the current block into entry/body/pre-finalize/exit regions,
// lets the caller generate the body, privatizes captured values, and
// registers an OutlineInfo whose post-outline callback rewrites the region
// into a kmpc_parallel_51 (device) or __kmpc_fork_call (host) invocation.
// (Presumed OpenMPIRBuilder::createParallel — declaration line missing from
// this listing.)
1434 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1435 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1436 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1437 omp::ProcBindKind ProcBind, bool IsCancellable) {
1438 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1439
1440 if (!updateToLocation(Loc))
1441 return Loc.IP;
1442
1443 uint32_t SrcLocStrSize;
1444 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1445 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1446 Value *ThreadID = getOrCreateThreadID(Ident);
1447 // If we generate code for the target device, we need to allocate
1448 // struct for aggregate params in the device default alloca address space.
1449 // OpenMP runtime requires that the params of the extracted functions are
1450 // passed as zero address space pointers. This flag ensures that extracted
1451 // function arguments are declared in zero address space
1452 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1453
1454 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1455 // only if we compile for host side.
// NOTE(review): the CreateCall line (1460) is missing from this listing —
// confirm against upstream.
1456 if (NumThreads && !Config.isTargetDevice()) {
1457 Value *Args[] = {
1458 Ident, ThreadID,
1459 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1461 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1462 }
1463
// NOTE(review): the CreateCall line (1469) is missing from this listing —
// confirm against upstream.
1464 if (ProcBind != OMP_PROC_BIND_default) {
1465 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1466 Value *Args[] = {
1467 Ident, ThreadID,
1468 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1470 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1471 }
1472
1473 BasicBlock *InsertBB = Builder.GetInsertBlock();
1474 Function *OuterFn = InsertBB->getParent();
1475
1476 // Save the outer alloca block because the insertion iterator may get
1477 // invalidated and we still need this later.
1478 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1479
1480 // Vector to remember instructions we used only during the modeling but which
1481 // we want to delete at the end.
// NOTE(review): the declaration of ToBeDeleted (line 1482) is missing from
// this listing — confirm against upstream.
1483
1484 // Change the location to the outer alloca insertion point to create and
1485 // initialize the allocas we pass into the parallel region.
1486 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1487 Builder.restoreIP(NewOuter);
1488 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1489 AllocaInst *ZeroAddrAlloca =
1490 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1491 Instruction *TIDAddr = TIDAddrAlloca;
1492 Instruction *ZeroAddr = ZeroAddrAlloca;
1493 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1494 // Add additional casts to enforce pointers in zero address space
1495 TIDAddr = new AddrSpaceCastInst(
1496 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1497 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1498 ToBeDeleted.push_back(TIDAddr);
1499 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1500 PointerType ::get(M.getContext(), 0),
1501 "zero.addr.ascast");
1502 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1503 ToBeDeleted.push_back(ZeroAddr);
1504 }
1505
1506 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1507 // associated arguments in the outlined function, so we delete them later.
1508 ToBeDeleted.push_back(TIDAddrAlloca);
1509 ToBeDeleted.push_back(ZeroAddrAlloca);
1510
1511 // Create an artificial insertion point that will also ensure the blocks we
1512 // are about to split are not degenerated.
1513 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1514
1515 BasicBlock *EntryBB = UI->getParent();
1516 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1517 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1518 BasicBlock *PRegPreFiniBB =
1519 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1520 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1521
1522 auto FiniCBWrapper = [&](InsertPointTy IP) {
1523 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1524 // target to the region exit block.
1525 if (IP.getBlock()->end() == IP.getPoint()) {
// NOTE(review): line 1526 is missing from this listing — confirm against
// upstream.
1527 Builder.restoreIP(IP);
1528 Instruction *I = Builder.CreateBr(PRegExitBB);
1529 IP = InsertPointTy(I->getParent(), I->getIterator());
1530 }
1531 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1532 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1533 "Unexpected insertion point for finalization call!");
1534 return FiniCB(IP);
1535 };
1536
1537 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1538
1539 // Generate the privatization allocas in the block that will become the entry
1540 // of the outlined function.
1541 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1542 InsertPointTy InnerAllocaIP = Builder.saveIP();
1543
1544 AllocaInst *PrivTIDAddr =
1545 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1546 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1547
1548 // Add some fake uses for OpenMP provided arguments.
1549 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1550 Instruction *ZeroAddrUse =
1551 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1552 ToBeDeleted.push_back(ZeroAddrUse);
1553
1554 // EntryBB
1555 // |
1556 // V
1557 // PRegionEntryBB <- Privatization allocas are placed here.
1558 // |
1559 // V
1560 // PRegionBodyBB <- BodeGen is invoked here.
1561 // |
1562 // V
1563 // PRegPreFiniBB <- The block we will start finalization from.
1564 // |
1565 // V
1566 // PRegionExitBB <- A common exit to simplify block collection.
1567 //
1568
1569 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1570
1571 // Let the caller create the body.
1572 assert(BodyGenCB && "Expected body generation callback!");
1573 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1574 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1575 return Err;
1576
1577 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1578
1579 OutlineInfo OI;
1580 if (Config.isTargetDevice()) {
1581 // Generate OpenMP target specific runtime call
1582 OI.PostOutlineCB = [=, ToBeDeletedVec =
1583 std::move(ToBeDeleted)](Function &OutlinedFn) {
1584 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1585 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1586 ThreadID, ToBeDeletedVec);
1587 };
1588 } else {
1589 // Generate OpenMP host runtime call
1590 OI.PostOutlineCB = [=, ToBeDeletedVec =
1591 std::move(ToBeDeleted)](Function &OutlinedFn) {
1592 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1593 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1594 };
1595 }
1596
1597 OI.OuterAllocaBB = OuterAllocaBlock;
1598 OI.EntryBB = PRegEntryBB;
1599 OI.ExitBB = PRegExitBB;
1600
// NOTE(review): the declaration of `Blocks` (line 1602) is missing from this
// listing — confirm against upstream.
1601 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1603 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1604
1605 // Ensure a single exit node for the outlined region by creating one.
1606 // We might have multiple incoming edges to the exit now due to finalizations,
1607 // e.g., cancel calls that cause the control flow to leave the region.
1608 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1609 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1610 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1611 Blocks.push_back(PRegOutlinedExitBB);
1612
1613 CodeExtractorAnalysisCache CEAC(*OuterFn);
1614 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1615 /* AggregateArgs */ false,
1616 /* BlockFrequencyInfo */ nullptr,
1617 /* BranchProbabilityInfo */ nullptr,
1618 /* AssumptionCache */ nullptr,
1619 /* AllowVarArgs */ true,
1620 /* AllowAlloca */ true,
1621 /* AllocationBlock */ OuterAllocaBlock,
1622 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1623
1624 // Find inputs to, outputs from the code region.
1625 BasicBlock *CommonExit = nullptr;
1626 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1627 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1628
1629 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1630 /*CollectGlobalInputs=*/true);
1631
// Ident globals are module-level constants; they need not be passed in as
// region arguments.
1632 Inputs.remove_if([&](Value *I) {
1633 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1634 return GV->getValueType() == OpenMPIRBuilder::Ident;
1635
1636 return false;
1637 });
1638
1639 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1640
1641 FunctionCallee TIDRTLFn =
1642 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1643
// Per-input privatization: reroutes uses inside the region either to the
// private TID or to whatever the PrivCB callback produces.
1644 auto PrivHelper = [&](Value &V) -> Error {
1645 if (&V == TIDAddr || &V == ZeroAddr) {
1646 OI.ExcludeArgsFromAggregate.push_back(&V);
1647 return Error::success();
1648 }
1649
// NOTE(review): the declaration of `Uses` (line 1650) is missing from this
// listing — confirm against upstream.
1651 for (Use &U : V.uses())
1652 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1653 if (ParallelRegionBlockSet.count(UserI->getParent()))
1654 Uses.insert(&U);
1655
1656 // __kmpc_fork_call expects extra arguments as pointers. If the input
1657 // already has a pointer type, everything is fine. Otherwise, store the
1658 // value onto stack and load it back inside the to-be-outlined region. This
1659 // will ensure only the pointer will be passed to the function.
1660 // FIXME: if there are more than 15 trailing arguments, they must be
1661 // additionally packed in a struct.
1662 Value *Inner = &V;
1663 if (!V.getType()->isPointerTy()) {
1665 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1666
1667 Builder.restoreIP(OuterAllocaIP);
1668 Value *Ptr =
1669 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1670
1671 // Store to stack at end of the block that currently branches to the entry
1672 // block of the to-be-outlined region.
1673 Builder.SetInsertPoint(InsertBB,
1674 InsertBB->getTerminator()->getIterator());
1675 Builder.CreateStore(&V, Ptr);
1676
1677 // Load back next to allocations in the to-be-outlined region.
1678 Builder.restoreIP(InnerAllocaIP);
1679 Inner = Builder.CreateLoad(V.getType(), Ptr);
1680 }
1681
1682 Value *ReplacementValue = nullptr;
1683 CallInst *CI = dyn_cast<CallInst>(&V);
1684 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1685 ReplacementValue = PrivTID;
1686 } else {
1687 InsertPointOrErrorTy AfterIP =
1688 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1689 if (!AfterIP)
1690 return AfterIP.takeError();
1691 Builder.restoreIP(*AfterIP);
1692 InnerAllocaIP = {
1693 InnerAllocaIP.getBlock(),
1694 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1695
1696 assert(ReplacementValue &&
1697 "Expected copy/create callback to set replacement value!");
1698 if (ReplacementValue == &V)
1699 return Error::success();
1700 }
1701
1702 for (Use *UPtr : Uses)
1703 UPtr->set(ReplacementValue);
1704
1705 return Error::success();
1706 };
1707
1708 // Reset the inner alloca insertion as it will be used for loading the values
1709 // wrapped into pointers before passing them into the to-be-outlined region.
1710 // Configure it to insert immediately after the fake use of zero address so
1711 // that they are available in the generated body and so that the
1712 // OpenMP-related values (thread ID and zero address pointers) remain leading
1713 // in the argument list.
1714 InnerAllocaIP = IRBuilder<>::InsertPoint(
1715 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1716
1717 // Reset the outer alloca insertion point to the entry of the relevant block
1718 // in case it was invalidated.
1719 OuterAllocaIP = IRBuilder<>::InsertPoint(
1720 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1721
1722 for (Value *Input : Inputs) {
1723 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1724 if (Error Err = PrivHelper(*Input))
1725 return Err;
1726 }
1727 LLVM_DEBUG({
1728 for (Value *Output : Outputs)
1729 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1730 });
1731 assert(Outputs.empty() &&
1732 "OpenMP outlining should not produce live-out values!");
1733
1734 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1735 LLVM_DEBUG({
1736 for (auto *BB : Blocks)
1737 dbgs() << " PBR: " << BB->getName() << "\n";
1738 });
1739
1740 // Adjust the finalization stack, verify the adjustment, and call the
1741 // finalize function a last time to finalize values between the pre-fini
1742 // block and the exit block if we left the parallel "the normal way".
1743 auto FiniInfo = FinalizationStack.pop_back_val();
1744 (void)FiniInfo;
1745 assert(FiniInfo.DK == OMPD_parallel &&
1746 "Unexpected finalization stack state!");
1747
1748 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1749
1750 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1751 if (Error Err = FiniCB(PreFiniIP))
1752 return Err;
1753
1754 // Register the outlined info.
1755 addOutlineInfo(std::move(OI));
1756
// Resume code generation after the artificial terminator, then drop it.
1757 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1758 UI->eraseFromParent();
1759
1760 return AfterIP;
1761}
1762
// Emits the __kmpc_flush(ident) runtime call at the current insertion point.
// (Presumed OpenMPIRBuilder::emitFlush — declaration line missing.)
1764 // Build call void __kmpc_flush(ident_t *loc)
1765 uint32_t SrcLocStrSize;
1766 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1767 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1768
1769 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1770}
1771
// Public entry point for "omp flush": validates/sets the insertion location
// and delegates to emitFlush. (Presumed OpenMPIRBuilder::createFlush —
// declaration line missing.)
1773 if (!updateToLocation(Loc))
1774 return;
1775 emitFlush(Loc);
1776}
1777
// Emits the __kmpc_omp_taskwait(ident, gtid) runtime call. (Presumed
// OpenMPIRBuilder::emitTaskwaitImpl — declaration line missing.)
1779 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1780 // global_tid);
1781 uint32_t SrcLocStrSize;
1782 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1783 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1784 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1785
1786 // Ignore return result until untied tasks are supported.
1787 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1788 Args);
1789}
1790
// Public entry point for "omp taskwait": validates/sets the insertion
// location and delegates to emitTaskwaitImpl. (Presumed
// OpenMPIRBuilder::createTaskwait — declaration line missing.)
1792 if (!updateToLocation(Loc))
1793 return;
1794 emitTaskwaitImpl(Loc);
1795}
1796
// Emits the __kmpc_omp_taskyield(ident, gtid, 0) runtime call. (Presumed
// OpenMPIRBuilder::emitTaskyieldImpl — declaration line missing.)
1798 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1799 uint32_t SrcLocStrSize;
1800 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1801 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// NOTE(review): the definition of I32Null (line 1802) is missing from this
// listing — confirm against upstream.
1803 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1804
1805 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1806 Args);
1807}
1808
  // Public entry point for 'omp taskyield'; no-op when Loc is invalid.
1810  if (!updateToLocation(Loc))
1811    return;
1812  emitTaskyieldImpl(Loc);
1813}
1814
1815// Processes the dependencies in Dependencies and does the following
1816// - Allocates space on the stack of an array of DependInfo objects
1817// - Populates each DependInfo object with relevant information of
1818// the corresponding dependence.
1819// - All code is inserted in the entry block of the current function.
1821    OpenMPIRBuilder &OMPBuilder,
1823  // Early return if we have no dependencies to process
1824  if (Dependencies.empty())
1825    return nullptr;
1826
1827  // Given a vector of DependData objects, in this function we create an
1828  // array on the stack that holds kmp_dep_info objects corresponding
1829  // to each dependency. This is then passed to the OpenMP runtime.
1830  // For example, if there are 'n' dependencies then the following psedo
1831  // code is generated. Assume the first dependence is on a variable 'a'
1832  //
1833  // \code{c}
1834  // DepArray = alloc(n x sizeof(kmp_depend_info);
1835  // idx = 0;
1836  // DepArray[idx].base_addr = ptrtoint(&a);
1837  // DepArray[idx].len = 8;
1838  // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
1839  // ++idx;
1840  // DepArray[idx].base_addr = ...;
1841  // \endcode
1842
  // Free function: pull the builder, the kmp_depend_info struct type, and the
  // module out of the passed-in OpenMPIRBuilder.
1843  IRBuilderBase &Builder = OMPBuilder.Builder;
1844  Type *DependInfo = OMPBuilder.DependInfo;
1845  Module &M = OMPBuilder.M;
1846
1847  Value *DepArray = nullptr;
  // Allocas must live in the function entry block so they dominate all uses;
  // save the current position and restore it once the array is built.
1848  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1849  Builder.SetInsertPoint(
1851
1852  Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1853  DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1854
  // Fill one kmp_depend_info slot (base_addr, len, flags) per dependency.
1855  for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1856    Value *Base =
1857        Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1858    // Store the pointer to the variable
1859    Value *Addr = Builder.CreateStructGEP(
1860        DependInfo, Base,
1861        static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1862    Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1863    Builder.CreateStore(DepValPtr, Addr);
1864    // Store the size of the variable
1865    Value *Size = Builder.CreateStructGEP(
1866        DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1867    Builder.CreateStore(
1868        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1869        Size);
1870    // Store the dependency kind
1871    Value *Flags = Builder.CreateStructGEP(
1872        DependInfo, Base,
1873        static_cast<unsigned int>(RTLDependInfoFields::Flags));
1874    Builder.CreateStore(
1875        ConstantInt::get(Builder.getInt8Ty(),
1876                         static_cast<unsigned int>(Dep.DepKind)),
1877        Flags);
1878  }
  // Back to the caller's insertion point; return the entry-block array.
1879  Builder.restoreIP(OldIP);
1880  return DepArray;
1881}
1882
1884    const LocationDescription &Loc, InsertPointTy AllocaIP,
1885    BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1886    SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1887    Value *Priority) {
1888
  // Bail out early when the location is invalid; callers get a default
  // (unset) insert point in that case.
1889  if (!updateToLocation(Loc))
1890    return InsertPointTy();
1891
1892  uint32_t SrcLocStrSize;
1893  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1894  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1895  // The current basic block is split into four basic blocks. After outlining,
1896  // they will be mapped as follows:
1897  // ```
1898  // def current_fn() {
1899  //   current_basic_block:
1900  //     br label %task.exit
1901  //   task.exit:
1902  //     ; instructions after task
1903  // }
1904  // def outlined_fn() {
1905  //   task.alloca:
1906  //     br label %task.body
1907  //   task.body:
1908  //     ret void
1909  // }
1910  // ```
1911  BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1912  BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1913  BasicBlock *TaskAllocaBB =
1914      splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1915
  // Let the caller generate the task body between the alloca and body points.
1916  InsertPointTy TaskAllocaIP =
1917      InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1918  InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1919  if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1920    return Err;
1921
  // Describe the region [task.alloca, task.exit) to be outlined later.
1922  OutlineInfo OI;
1923  OI.EntryBB = TaskAllocaBB;
1924  OI.OuterAllocaBB = AllocaIP.getBlock();
1925  OI.ExitBB = TaskExitBB;
1926
1927  // Add the thread ID argument.
1930      Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1931
  // The real task machinery is emitted after outlining, once the outlined
  // function and its single (stale) call site exist.
1932  OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1933                      Mergeable, Priority, EventHandle, TaskAllocaBB,
1934                      ToBeDeleted](Function &OutlinedFn) mutable {
1935    // Replace the Stale CI by appropriate RTL function call.
1936    assert(OutlinedFn.getNumUses() == 1 &&
1937           "there must be a single user for the outlined function");
1938    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1939
1940    // HasShareds is true if any variables are captured in the outlined region,
1941    // false otherwise.
1942    bool HasShareds = StaleCI->arg_size() > 1;
1943    Builder.SetInsertPoint(StaleCI);
1944
1945    // Gather the arguments for emitting the runtime call for
1946    // @__kmpc_omp_task_alloc
1947    Function *TaskAllocFn =
1948        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1949
1950    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
1951    // call.
1952    Value *ThreadID = getOrCreateThreadID(Ident);
1953
1954    // Argument - `flags`
1955    // Task is tied iff (Flags & 1) == 1.
1956    // Task is untied iff (Flags & 1) == 0.
1957    // Task is final iff (Flags & 2) == 2.
1958    // Task is not final iff (Flags & 2) == 0.
1959    // Task is mergeable iff (Flags & 4) == 4.
1960    // Task is not mergeable iff (Flags & 4) == 0.
1961    // Task is priority iff (Flags & 32) == 32.
1962    // Task is not priority iff (Flags & 32) == 0.
1963    // TODO: Handle the other flags.
1964    Value *Flags = Builder.getInt32(Tied);
1965    if (Final) {
1966      Value *FinalFlag =
1968      Flags = Builder.CreateOr(FinalFlag, Flags);
1969    }
1970
1971    if (Mergeable)
1973    if (Priority)
1975
1976    // Argument - `sizeof_kmp_task_t` (TaskSize)
1977    // Tasksize refers to the size in bytes of kmp_task_t data structure
1978    // including private vars accessed in task.
1979    // TODO: add kmp_task_t_with_privates (privates)
1980    Value *TaskSize = Builder.getInt64(
1982
1983    // Argument - `sizeof_shareds` (SharedsSize)
1984    // SharedsSize refers to the shareds array size in the kmp_task_t data
1985    // structure.
1986    Value *SharedsSize = Builder.getInt64(0);
1987    if (HasShareds) {
1988      AllocaInst *ArgStructAlloca =
1989          dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1990      assert(ArgStructAlloca &&
1991             "Unable to find the alloca instruction corresponding to arguments "
1992             "for extracted function");
1993      StructType *ArgStructType =
1994          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1995      assert(ArgStructType && "Unable to find struct type corresponding to "
1996                              "arguments for extracted function");
1997      SharedsSize =
1999    }
2000    // Emit the @__kmpc_omp_task_alloc runtime call
2001    // The runtime call returns a pointer to an area where the task captured
2002    // variables must be copied before the task is run (TaskData)
2003    CallInst *TaskData = Builder.CreateCall(
2004        TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2005                      /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2006                      /*task_func=*/&OutlinedFn});
2007
2008    // Emit detach clause initialization.
2009    // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2010    // task_descriptor);
2011    if (EventHandle) {
2013          OMPRTL___kmpc_task_allow_completion_event);
2014      llvm::Value *EventVal =
2015          Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2016      llvm::Value *EventHandleAddr =
2018                                           Builder.getPtrTy(0));
2019      EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2020      Builder.CreateStore(EventVal, EventHandleAddr);
2021    }
2022    // Copy the arguments for outlined function
2023    if (HasShareds) {
2024      Value *Shareds = StaleCI->getArgOperand(1);
2025      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2026      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2027      Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2028                           SharedsSize);
2029    }
2030
2031    if (Priority) {
2032      //
2033      // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2034      // we populate the priority information into the "kmp_task_t" here
2035      //
2036      // The struct "kmp_task_t" definition is available in kmp.h
2037      // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2038      // data2 is used for priority
2039      //
2040      Type *Int32Ty = Builder.getInt32Ty();
2041      Constant *Zero = ConstantInt::get(Int32Ty, 0);
2042      // kmp_task_t* => { ptr }
2043      Type *TaskPtr = StructType::get(VoidPtr);
2044      Value *TaskGEP =
2045          Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2046      // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2047      Type *TaskStructType = StructType::get(
2048          VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2049      Value *PriorityData = Builder.CreateInBoundsGEP(
2050          TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2051      // kmp_cmplrdata_t => { ptr, ptr }
2052      Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2053      Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2054                                                   PriorityData, {Zero, Zero});
2055      Builder.CreateStore(Priority, CmplrData);
2056    }
2057
    // Materialize the kmp_depend_info array (in the entry block) when the
    // task carries 'depend' clauses.
2058    Value *DepArray = nullptr;
2059    if (Dependencies.size()) {
2060      InsertPointTy OldIP = Builder.saveIP();
2062          &OldIP.getBlock()->getParent()->getEntryBlock().back());
2063
2064      Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2065      DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2066
2067      unsigned P = 0;
2068      for (const DependData &Dep : Dependencies) {
2069        Value *Base =
2070            Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
2071        // Store the pointer to the variable
2073            DependInfo, Base,
2074            static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2075        Value *DepValPtr =
2077        Builder.CreateStore(DepValPtr, Addr);
2078        // Store the size of the variable
2080            DependInfo, Base,
2081            static_cast<unsigned int>(RTLDependInfoFields::Len));
2083                                Dep.DepValueType)),
2084            Size);
2085        // Store the dependency kind
2087            DependInfo, Base,
2088            static_cast<unsigned int>(RTLDependInfoFields::Flags));
2090            ConstantInt::get(Builder.getInt8Ty(),
2091                             static_cast<unsigned int>(Dep.DepKind)),
2092            Flags);
2093        ++P;
2094      }
2095
2096      Builder.restoreIP(OldIP);
2097    }
2098
2099    // In the presence of the `if` clause, the following IR is generated:
2100    // ...
2101    // %data = call @__kmpc_omp_task_alloc(...)
2102    // br i1 %if_condition, label %then, label %else
2103    // then:
2104    // call @__kmpc_omp_task(...)
2105    // br label %exit
2106    // else:
2107    // ;; Wait for resolution of dependencies, if any, before
2108    // ;; beginning the task
2109    // call @__kmpc_omp_wait_deps(...)
2110    // call @__kmpc_omp_task_begin_if0(...)
2111    // call @outlined_fn(...)
2112    // call @__kmpc_omp_task_complete_if0(...)
2113    // br label %exit
2114    // exit:
2115    // ...
2116    if (IfCondition) {
2117      // `SplitBlockAndInsertIfThenElse` requires the block to have a
2118      // terminator.
2119      splitBB(Builder, /*CreateBranch=*/true, "if.end");
2120      Instruction *IfTerminator =
2121          Builder.GetInsertPoint()->getParent()->getTerminator();
2122      Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2123      Builder.SetInsertPoint(IfTerminator);
2124      SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2125                                    &ElseTI);
2126      Builder.SetInsertPoint(ElseTI);
2127
2128      if (Dependencies.size()) {
2129        Function *TaskWaitFn =
2130            getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2132            TaskWaitFn,
2133            {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2134             ConstantInt::get(Builder.getInt32Ty(), 0),
2136      }
2137      Function *TaskBeginFn =
2138          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2139      Function *TaskCompleteFn =
2140          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2141      Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2142      CallInst *CI = nullptr;
2143      if (HasShareds)
2144        CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2145      else
2146        CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2147      CI->setDebugLoc(StaleCI->getDebugLoc());
2148      Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2149      Builder.SetInsertPoint(ThenTI);
2150    }
2151
2152    if (Dependencies.size()) {
2153      Function *TaskFn =
2154          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2156          TaskFn,
2157          {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2158           DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2160
2161    } else {
2162      // Emit the @__kmpc_omp_task runtime call to spawn the task
2163      Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2164      Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2165    }
2166
2167    StaleCI->eraseFromParent();
2168
    // Inside the outlined function, rewrite uses of the shareds argument to
    // go through a load of the forwarded pointer.
2169    Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2170    if (HasShareds) {
2171      LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2172      OutlinedFn.getArg(1)->replaceUsesWithIf(
2173          Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2174    }
2175
2176    for (Instruction *I : llvm::reverse(ToBeDeleted))
2177      I->eraseFromParent();
2178  };
2179
2180  addOutlineInfo(std::move(OI));
2181  Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2182
2183  return Builder.saveIP();
2184}
2185
2188                                                InsertPointTy AllocaIP,
2189                                                BodyGenCallbackTy BodyGenCB) {
2190  if (!updateToLocation(Loc))
2191    return InsertPointTy();
2192
2193  uint32_t SrcLocStrSize;
2194  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2195  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2196  Value *ThreadID = getOrCreateThreadID(Ident);
2197
2198  // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2199  Function *TaskgroupFn =
2200      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2201  Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2202
  // The body is generated between the start call and taskgroup.exit; the
  // callback receives the caller-provided alloca point unchanged.
2203  BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2204  if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2205    return Err;
2206
2207  Builder.SetInsertPoint(TaskgroupExitBB);
2208  // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2209  Function *EndTaskgroupFn =
2210      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2211  Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2212
2213  return Builder.saveIP();
2214}
2215
2217    const LocationDescription &Loc, InsertPointTy AllocaIP,
2219    FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2220  assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2221
2222  if (!updateToLocation(Loc))
2223    return Loc.IP;
2224
  // Wrap the user's finalization callback so it also works when invoked at
  // the end of a (terminator-less) cancellation block.
2225  auto FiniCBWrapper = [&](InsertPointTy IP) {
2226    if (IP.getBlock()->end() != IP.getPoint())
2227      return FiniCB(IP);
2228    // This must be done otherwise any nested constructs using FinalizeOMPRegion
2229    // will fail because that function requires the Finalization Basic Block to
2230    // have a terminator, which is already removed by EmitOMPRegionBody.
2231    // IP is currently at cancelation block.
2232    // We need to backtrack to the condition block to fetch
2233    // the exit block and create a branch from cancelation
2234    // to exit block.
2236    Builder.restoreIP(IP);
2237    auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2238    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2239    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2240    Instruction *I = Builder.CreateBr(ExitBB);
2241    IP = InsertPointTy(I->getParent(), I->getIterator());
2242    return FiniCB(IP);
2243  };
2244
2245  FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2246
2247  // Each section is emitted as a switch case
2248  // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2249  // -> OMP.createSection() which generates the IR for each section
2250  // Iterate through all sections and emit a switch construct:
2251  // switch (IV) {
2252  // case 0:
2253  // <SectionStmt[0]>;
2254  // break;
2255  // ...
2256  // case <NumSection> - 1:
2257  // <SectionStmt[<NumSection> - 1]>;
2258  // break;
2259  // }
2260  // ...
2261  // section_loop.after:
2262  // <FiniCB>;
2263  auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2264    Builder.restoreIP(CodeGenIP);
2266        splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2267    Function *CurFn = Continue->getParent();
2268    SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2269
2270    unsigned CaseNumber = 0;
2271    for (auto SectionCB : SectionCBs) {
2273          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2274      SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2275      Builder.SetInsertPoint(CaseBB);
2276      BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2277      if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2278                                                  CaseEndBr->getIterator()}))
2279        return Err;
2280      CaseNumber++;
2281    }
2282    // remove the existing terminator from body BB since there can be no
2283    // terminators after switch/case
2284    return Error::success();
2285  };
2286  // Loop body ends here
2287  // LowerBound, UpperBound, and STride for createCanonicalLoop
  // The sections construct is lowered to a canonical loop over [0, NumSections)
  // which is then workshared statically across the team.
2288  Type *I32Ty = Type::getInt32Ty(M.getContext());
2289  Value *LB = ConstantInt::get(I32Ty, 0);
2290  Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2291  Value *ST = ConstantInt::get(I32Ty, 1);
2293      Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2294  if (!LoopInfo)
2295    return LoopInfo.takeError();
2296
  // !IsNowait => emit an implicit barrier after the workshare loop.
2297  InsertPointOrErrorTy WsloopIP =
2298      applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2299  if (!WsloopIP)
2300    return WsloopIP.takeError();
2301  InsertPointTy AfterIP = *WsloopIP;
2302
2303  // Apply the finalization callback in LoopAfterBB
2304  auto FiniInfo = FinalizationStack.pop_back_val();
2305  assert(FiniInfo.DK == OMPD_sections &&
2306         "Unexpected finalization stack state!");
2307  if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2308    Builder.restoreIP(AfterIP);
2309    BasicBlock *FiniBB =
2310        splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2311    if (Error Err = CB(Builder.saveIP()))
2312      return Err;
2313    AfterIP = {FiniBB, FiniBB->begin()};
2314  }
2315
2316  return AfterIP;
2317}
2318
2321                                                    BodyGenCallbackTy BodyGenCB,
2322                                                    FinalizeCallbackTy FiniCB) {
2323  if (!updateToLocation(Loc))
2324    return Loc.IP;
2325
  // Same cancellation-aware wrapper pattern as createSections: if the IP is at
  // the end of a terminator-less block, branch to the exit block first.
2326  auto FiniCBWrapper = [&](InsertPointTy IP) {
2327    if (IP.getBlock()->end() != IP.getPoint())
2328      return FiniCB(IP);
2329    // This must be done otherwise any nested constructs using FinalizeOMPRegion
2330    // will fail because that function requires the Finalization Basic Block to
2331    // have a terminator, which is already removed by EmitOMPRegionBody.
2332    // IP is currently at cancelation block.
2333    // We need to backtrack to the condition block to fetch
2334    // the exit block and create a branch from cancelation
2335    // to exit block.
2337    Builder.restoreIP(IP);
2338    auto *CaseBB = Loc.IP.getBlock();
2339    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2340    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2341    Instruction *I = Builder.CreateBr(ExitBB);
2342    IP = InsertPointTy(I->getParent(), I->getIterator());
2343    return FiniCB(IP);
2344  };
2345
2346  Directive OMPD = Directive::OMPD_sections;
2347  // Since we are using Finalization Callback here, HasFinalize
2348  // and IsCancellable have to be true
2349  return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2350                              /*Conditional*/ false, /*hasFinalize*/ true,
2351                              /*IsCancellable*/ true);
2352}
2353
  // Advance the iterator past I so that new instructions are inserted
  // immediately after it.
2356  IT++;
2357  return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2358}
2359
2360Value *OpenMPIRBuilder::getGPUThreadID() {
  // Thread id within the current block/CTA, obtained from the device runtime
  // (__kmpc_get_hardware_thread_id_in_block); the call takes no arguments.
2361  return Builder.CreateCall(
2363          OMPRTL___kmpc_get_hardware_thread_id_in_block),
2364      {});
2365}
2366
2367Value *OpenMPIRBuilder::getGPUWarpSize() {
2368 return Builder.CreateCall(
2369 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2370}
2371
2372Value *OpenMPIRBuilder::getNVPTXWarpID() {
2373 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2374 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2375}
2376
2377Value *OpenMPIRBuilder::getNVPTXLaneID() {
2378 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2379 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2380 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2381 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2382 "nvptx_lane_id");
2383}
2384
2385Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2386                                        Type *ToType) {
2387  Type *FromType = From->getType();
2388  uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2389  uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2390  assert(FromSize > 0 && "From size must be greater than zero");
2391  assert(ToSize > 0 && "To size must be greater than zero");
  // Fast paths: identity, same-size bitcast, integer-to-integer (sign)
  // extension/truncation.
2392  if (FromType == ToType)
2393    return From;
2394  if (FromSize == ToSize)
2395    return Builder.CreateBitCast(From, ToType);
2396  if (ToType->isIntegerTy() && FromType->isIntegerTy())
2397    return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
  // General case: spill through a destination-typed stack slot — store the
  // source value and reload it as the destination type. The alloca is placed
  // at AllocaIP so it dominates later uses.
2398  InsertPointTy SaveIP = Builder.saveIP();
2399  Builder.restoreIP(AllocaIP);
2400  Value *CastItem = Builder.CreateAlloca(ToType);
2401  Builder.restoreIP(SaveIP);
2402
2404      CastItem, Builder.getPtrTy(0));
2405  Builder.CreateStore(From, ValCastItem);
2406  return Builder.CreateLoad(ToType, CastItem);
2407}
2408
2409Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2410                                                     Value *Element,
2411                                                     Type *ElementType,
2412                                                     Value *Offset) {
2413  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2414  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2415
2416  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2417  Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2418  Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2419  Value *WarpSize =
2420      Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
  // Pick the 32- or 64-bit runtime shuffle entry point to match CastTy.
2422      Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2423                : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2424  Value *WarpSizeCast =
2425      Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2426  Value *ShuffleCall =
2427      Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
  // Result is converted back through the same integer type it was shuffled as.
2428  return castValueToType(AllocaIP, ShuffleCall, CastTy);
2429}
2430
2431void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2432                                      Value *DstAddr, Type *ElemType,
2433                                      Value *Offset, Type *ReductionArrayTy) {
2435  // Create the loop over the big sized data.
2436  // ptr = (void*)Elem;
2437  // ptrEnd = (void*) Elem + 1;
2438  // Step = 8;
2439  // while (ptr + Step < ptrEnd)
2440  //   shuffle((int64_t)*ptr);
2441  // Step = 4;
2442  // while (ptr + Step < ptrEnd)
2443  //   shuffle((int32_t)*ptr);
2444  // ...
2445  Type *IndexTy = Builder.getIndexTy(
2447  Value *ElemPtr = DstAddr;
2448  Value *Ptr = SrcAddr;
  // Shuffle the element in the largest chunks first (8, 4, 2, 1 bytes),
  // handling any remainder with the next smaller chunk size.
2449  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2450    if (Size < IntSize)
2451      continue;
2452    Type *IntType = Builder.getIntNTy(IntSize * 8);
2454        Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2455    Value *SrcAddrGEP =
2456        Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2458        ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2459
2460    Function *CurFunc = Builder.GetInsertBlock()->getParent();
    // More than one chunk of this size: emit an explicit pre-cond loop that
    // walks Ptr/ElemPtr via PHIs until fewer than IntSize bytes remain.
2461    if ((Size / IntSize) > 1) {
2463          SrcAddrGEP, Builder.getPtrTy());
2464      BasicBlock *PreCondBB =
2465          BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2466      BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2467      BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2468      BasicBlock *CurrentBB = Builder.GetInsertBlock();
2469      emitBlock(PreCondBB, CurFunc);
2470      PHINode *PhiSrc =
2471          Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2472      PhiSrc->addIncoming(Ptr, CurrentBB);
2473      PHINode *PhiDest =
2474          Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2475      PhiDest->addIncoming(ElemPtr, CurrentBB);
2476      Ptr = PhiSrc;
2477      ElemPtr = PhiDest;
2478      Value *PtrDiff = Builder.CreatePtrDiff(
2479          Builder.getInt8Ty(), PtrEnd,
2482          Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2483          ExitBB);
2484      emitBlock(ThenBB, CurFunc);
2485      Value *Res = createRuntimeShuffleFunction(
2486          AllocaIP,
2488              IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2489          IntType, Offset);
2490      Builder.CreateAlignedStore(Res, ElemPtr,
2491                                 M.getDataLayout().getPrefTypeAlign(ElemType));
2492      Value *LocalPtr =
2493          Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2494      Value *LocalElemPtr =
2495          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2496      PhiSrc->addIncoming(LocalPtr, ThenBB);
2497      PhiDest->addIncoming(LocalElemPtr, ThenBB);
2498      emitBranch(PreCondBB);
2499      emitBlock(ExitBB, CurFunc);
2500    } else {
      // Exactly one chunk of this size: shuffle it straight-line, truncating
      // the shuffled value when the element is a narrower integer.
2501      Value *Res = createRuntimeShuffleFunction(
2502          AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2503      if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2504                                         Res->getType()->getScalarSizeInBits())
2505        Res = Builder.CreateTrunc(Res, ElemType);
2506      Builder.CreateStore(Res, ElemPtr);
2507      Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2508      ElemPtr =
2509          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2510    }
2511    Size = Size % IntSize;
2512  }
2513}
2514
2515void OpenMPIRBuilder::emitReductionListCopy(
2516    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2517    ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2518    CopyOptionsTy CopyOptions) {
2519  Type *IndexTy = Builder.getIndexTy(
2521  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2522
2523  // Iterates, element-by-element, through the source Reduce list and
2524  // make a copy.
2525  for (auto En : enumerate(ReductionInfos)) {
2526    const ReductionInfo &RI = En.value();
2527    Value *SrcElementAddr = nullptr;
2528    Value *DestElementAddr = nullptr;
2529    Value *DestElementPtrAddr = nullptr;
2530    // Should we shuffle in an element from a remote lane?
2531    bool ShuffleInElement = false;
2532    // Set to true to update the pointer in the dest Reduce list to a
2533    // newly created element.
2534    bool UpdateDestListPtr = false;
2535
2536    // Step 1.1: Get the address for the src element in the Reduce list.
2537    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2538        ReductionArrayTy, SrcBase,
2539        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2540    SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2541
2542    // Step 1.2: Create a temporary to store the element in the destination
2543    // Reduce list.
2544    DestElementPtrAddr = Builder.CreateInBoundsGEP(
2545        ReductionArrayTy, DestBase,
2546        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    // Dispatch on the copy action: the first case allocates a fresh private
    // element (at AllocaIP) and shuffles into it; the second reuses the
    // destination element address already present in the list.
2547    switch (Action) {
2549      InsertPointTy CurIP = Builder.saveIP();
2550      Builder.restoreIP(AllocaIP);
2551      AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2552                                                    ".omp.reduction.element");
2553      DestAlloca->setAlignment(
2554          M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2555      DestElementAddr = DestAlloca;
2556      DestElementAddr =
2557          Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2558                                      DestElementAddr->getName() + ".ascast");
2559      Builder.restoreIP(CurIP);
2560      ShuffleInElement = true;
2561      UpdateDestListPtr = true;
2562      break;
2563    }
2565      DestElementAddr =
2566          Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2567      break;
2568    }
2569    }
2570
2571    // Now that all active lanes have read the element in the
2572    // Reduce list, shuffle over the value from the remote lane.
2573    if (ShuffleInElement) {
2574      shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2575                      RemoteLaneOffset, ReductionArrayTy);
2576    } else {
      // Plain (non-shuffle) copy, specialized on the element's evaluation
      // kind: scalar load/store, complex real+imag pair, or memcpy for
      // aggregates.
2577      switch (RI.EvaluationKind) {
2578      case EvalKind::Scalar: {
2579        Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2580        // Store the source element value to the dest element address.
2581        Builder.CreateStore(Elem, DestElementAddr);
2582        break;
2583      }
2584      case EvalKind::Complex: {
2586            RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2587        Value *SrcReal = Builder.CreateLoad(
2588            RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2590            RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2591        Value *SrcImg = Builder.CreateLoad(
2592            RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2593
2595            RI.ElementType, DestElementAddr, 0, 0, ".realp");
2597            RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2598        Builder.CreateStore(SrcReal, DestRealPtr);
2599        Builder.CreateStore(SrcImg, DestImgPtr);
2600        break;
2601      }
2602      case EvalKind::Aggregate: {
2603        Value *SizeVal = Builder.getInt64(
2604            M.getDataLayout().getTypeStoreSize(RI.ElementType));
2606            DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2607            SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2608            SizeVal, false);
2609        break;
2610      }
2611      };
2612    }
2613
2614    // Step 3.1: Modify reference in dest Reduce list as needed.
2615    // Modifying the reference in Reduce list to point to the newly
2616    // created element. The element is live in the current function
2617    // scope and that of functions it invokes (i.e., reduce_function).
2618    // RemoteReduceData[i] = (void*)&RemoteElem
2619    if (UpdateDestListPtr) {
2621          DestElementAddr, Builder.getPtrTy(),
2622          DestElementAddr->getName() + ".ascast");
2623      Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2624    }
2625  }
2626}
2627
// Builds the device runtime helper
//   void _omp_reduction_inter_warp_copy_func(ptr ReduceList, i32 NumWarps)
// Each warp's lane 0 ("warp master") publishes one reduce element at a time
// into a shared-memory (addrspace 3) staging array indexed by warp id; after
// a kmpc barrier, the first NumWarps threads of the block read their slot
// back into their own reduce list. Elements wider than 4 bytes are moved in
// 4/2/1-byte chunks. Returns the created function, or an error propagated
// from barrier emission.
2628Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2629 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2630 AttributeList FuncAttrs) {
  // Remember the caller's insertion point; it is restored before returning.
2631 InsertPointTy SavedIP = Builder.saveIP();
2632 LLVMContext &Ctx = M.getContext();
2634 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2635 /* IsVarArg */ false);
2636 Function *WcFunc =
2638 "_omp_reduction_inter_warp_copy_func", &M);
2639 WcFunc->setAttributes(FuncAttrs);
2640 WcFunc->addParamAttr(0, Attribute::NoUndef);
2641 WcFunc->addParamAttr(1, Attribute::NoUndef);
2642 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2643 Builder.SetInsertPoint(EntryBB);
2644
2645 // ReduceList: thread local Reduce list.
2646 // At the stage of the computation when this function is called, partially
2647 // aggregated values reside in the first lane of every active warp.
2648 Argument *ReduceListArg = WcFunc->getArg(0);
2649 // NumWarps: number of warps active in the parallel region. This could
2650 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2651 Argument *NumWarpsArg = WcFunc->getArg(1);
2652
2653 // This array is used as a medium to transfer, one reduce element at a time,
2654 // the data from the first lane of every warp to lanes in the first warp
2655 // in order to perform the final step of a reduction in a parallel region
2656 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2657 // for reduced latency, as well as to have a distinct copy for concurrently
2658 // executing target regions. The array is declared with common linkage so
2659 // as to be shared across compilation units.
2660 StringRef TransferMediumName =
2661 "__openmp_nvptx_data_transfer_temporary_storage";
2662 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2663 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2664 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
  // Create the staging array lazily: one i32 slot per warp, reused across
  // all reductions emitted into this module.
2665 if (!TransferMedium) {
2666 TransferMedium = new GlobalVariable(
2667 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2668 UndefValue::get(ArrayTy), TransferMediumName,
2669 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2670 /*AddressSpace=*/3);
2671 }
2672
2673 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2674 Value *GPUThreadID = getGPUThreadID();
2675 // nvptx_lane_id = nvptx_id % warpsize
2676 Value *LaneID = getNVPTXLaneID();
2677 // nvptx_warp_id = nvptx_id / warpsize
2678 Value *WarpID = getNVPTXWarpID();
2679
2680 InsertPointTy AllocaIP =
2683 Type *Arg0Type = ReduceListArg->getType();
2684 Type *Arg1Type = NumWarpsArg->getType();
2685 Builder.restoreIP(AllocaIP);
2686 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2687 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2688 AllocaInst *NumWarpsAlloca =
2689 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2691 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2693 NumWarpsAlloca, Builder.getPtrTy(0),
2694 NumWarpsAlloca->getName() + ".ascast");
2695 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2696 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2697 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2698 InsertPointTy CodeGenIP =
2700 Builder.restoreIP(CodeGenIP);
2701
2702 Value *ReduceList =
2703 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2704
2705 for (auto En : enumerate(ReductionInfos)) {
2706 //
2707 // Warp master copies reduce element to transfer medium in __shared__
2708 // memory.
2709 //
2710 const ReductionInfo &RI = En.value();
2711 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
  // Transfer the element in progressively smaller power-of-two chunks
  // (4, then 2, then 1 bytes); RealTySize tracks the bytes still to move.
2712 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2713 Type *CType = Builder.getIntNTy(TySize * 8);
2714
2715 unsigned NumIters = RealTySize / TySize;
2716 if (NumIters == 0)
2717 continue;
2718 Value *Cnt = nullptr;
2719 Value *CntAddr = nullptr;
2720 BasicBlock *PrecondBB = nullptr;
2721 BasicBlock *ExitBB = nullptr;
  // Several chunks of this size: wrap the copy sequence in a counted loop
  // (precond/body/exit) driven by an alloca'd i32 counter.
2722 if (NumIters > 1) {
2723 CodeGenIP = Builder.saveIP();
2724 Builder.restoreIP(AllocaIP);
2725 CntAddr =
2726 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2727
2728 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2729 CntAddr->getName() + ".ascast")
2730 Builder.restoreIP(CodeGenIP);
2732 CntAddr,
2733 /*Volatile=*/false);
2734 PrecondBB = BasicBlock::Create(Ctx, "precond");
2735 ExitBB = BasicBlock::Create(Ctx, "exit");
2736 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2737 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2738 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2739 /*Volatile=*/false);
2741 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2742 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2744 }
2745
2746 // kmpc_barrier.
2747 InsertPointOrErrorTy BarrierIP1 =
2748 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2749 omp::Directive::OMPD_unknown,
2750 /* ForceSimpleCall */ false,
2751 /* CheckCancelFlag */ true);
2752 if (!BarrierIP1)
2753 return BarrierIP1.takeError();
2754 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2755 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2756 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2757
2758 // if (lane_id == 0)
2759 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2760 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2762
2763 // Reduce element = LocalReduceList[i]
2764 auto *RedListArrayTy =
2765 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2766 Type *IndexTy = Builder.getIndexTy(
2768 Value *ElemPtrPtr =
2769 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2770 {ConstantInt::get(IndexTy, 0),
2771 ConstantInt::get(IndexTy, En.index())});
2772 // elemptr = ((CopyType*)(elemptrptr)) + I
2773 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2774 if (NumIters > 1)
2775 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2776
2777 // Get pointer to location in transfer medium.
2778 // MediumPtr = &medium[warp_id]
2779 Value *MediumPtr = Builder.CreateInBoundsGEP(
2780 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2781 // elem = *elemptr
2782 //*MediumPtr = elem
2783 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2784 // Store the source element value to the dest element address.
2785 Builder.CreateStore(Elem, MediumPtr,
2786 /*IsVolatile*/ true);
2787 Builder.CreateBr(MergeBB);
2788
2789 // else
2791 Builder.CreateBr(MergeBB);
2792
2793 // endif
2795 InsertPointOrErrorTy BarrierIP2 =
2796 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2797 omp::Directive::OMPD_unknown,
2798 /* ForceSimpleCall */ false,
2799 /* CheckCancelFlag */ true);
2800 if (!BarrierIP2)
2801 return BarrierIP2.takeError();
2802
2803 // Warp 0 copies reduce element from transfer medium
2804 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2805 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2806 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2807
2808 Value *NumWarpsVal =
2809 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2810 // Up to 32 threads in warp 0 are active.
2811 Value *IsActiveThread =
2812 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2813 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2814
2815 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2816
2817 // SecMediumPtr = &medium[tid]
2818 // SrcMediumVal = *SrcMediumPtr
2819 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2820 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2821 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2822 Value *TargetElemPtrPtr =
2823 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2824 {ConstantInt::get(IndexTy, 0),
2825 ConstantInt::get(IndexTy, En.index())});
2826 Value *TargetElemPtrVal =
2827 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2828 Value *TargetElemPtr = TargetElemPtrVal;
2829 if (NumIters > 1)
2830 TargetElemPtr =
2831 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2832
2833 // *TargetElemPtr = SrcMediumVal;
2834 Value *SrcMediumValue =
2835 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2836 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2837 Builder.CreateBr(W0MergeBB);
2838
2839 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2840 Builder.CreateBr(W0MergeBB);
2841
2842 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2843
  // Close the chunk loop: bump the counter and branch back to precond.
2844 if (NumIters > 1) {
2845 Cnt = Builder.CreateNSWAdd(
2846 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2847 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2848
2849 auto *CurFn = Builder.GetInsertBlock()->getParent();
2850 emitBranch(PrecondBB);
2851 emitBlock(ExitBB, CurFn);
2852 }
2853 RealTySize %= TySize;
2854 }
2855 }
2856
2858 Builder.restoreIP(SavedIP);
2859
2860 return WcFunc;
2861}
2862
// Builds the device helper
//   void _omp_reduction_shuffle_and_reduce_func(ptr ReduceList, i16 LaneId,
//                                               i16 RemoteLaneOffset,
//                                               i16 AlgoVer)
// It copies each reduce element from a remote lane into a stack-local
// "remote" reduce list (emitReductionListCopy, RemoteLaneToThread), then
// conditionally calls ReduceFn to fold the remote list into the local one,
// and finally (for AlgoVer == 1 lanes with LaneId >= RemoteLaneOffset)
// copies the remote list back over the local list. The reduce/copy
// predicates are spelled out in the comments inside the body.
2863Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2864 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2865 AttributeList FuncAttrs) {
2866 LLVMContext &Ctx = M.getContext();
2867 FunctionType *FuncTy =
2869 {Builder.getPtrTy(), Builder.getInt16Ty(),
2870 Builder.getInt16Ty(), Builder.getInt16Ty()},
2871 /* IsVarArg */ false);
2872 Function *SarFunc =
2874 "_omp_reduction_shuffle_and_reduce_func", &M);
2875 SarFunc->setAttributes(FuncAttrs);
2876 SarFunc->addParamAttr(0, Attribute::NoUndef);
2877 SarFunc->addParamAttr(1, Attribute::NoUndef);
2878 SarFunc->addParamAttr(2, Attribute::NoUndef);
2879 SarFunc->addParamAttr(3, Attribute::NoUndef);
  // The three i16 arguments are marked sign-extended.
2880 SarFunc->addParamAttr(1, Attribute::SExt);
2881 SarFunc->addParamAttr(2, Attribute::SExt);
2882 SarFunc->addParamAttr(3, Attribute::SExt);
2883 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2884 Builder.SetInsertPoint(EntryBB);
2885
2886 // Thread local Reduce list used to host the values of data to be reduced.
2887 Argument *ReduceListArg = SarFunc->getArg(0);
2888 // Current lane id; could be logical.
2889 Argument *LaneIDArg = SarFunc->getArg(1);
2890 // Offset of the remote source lane relative to the current lane.
2891 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2892 // Algorithm version. This is expected to be known at compile time.
2893 Argument *AlgoVerArg = SarFunc->getArg(3);
2894
2895 Type *ReduceListArgType = ReduceListArg->getType();
2896 Type *LaneIDArgType = LaneIDArg->getType();
2897 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
  // Spill all four arguments to allocas so they can be address-space cast
  // and reloaded uniformly below.
2898 Value *ReduceListAlloca = Builder.CreateAlloca(
2899 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2900 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2901 LaneIDArg->getName() + ".addr");
2902 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2903 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2904 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2905 AlgoVerArg->getName() + ".addr");
2906 ArrayType *RedListArrayTy =
2907 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2908
2909 // Create a local thread-private variable to host the Reduce list
2910 // from a remote lane.
2911 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2912 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2913
2915 ReduceListAlloca, ReduceListArgType,
2916 ReduceListAlloca->getName() + ".ascast");
2918 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2919 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2920 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2921 RemoteLaneOffsetAlloca->getName() + ".ascast");
2923 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2925 RemoteReductionListAlloca, Builder.getPtrTy(),
2926 RemoteReductionListAlloca->getName() + ".ascast");
2927
2928 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2929 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2930 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2931 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2932
2933 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2934 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2935 Value *RemoteLaneOffset =
2936 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2937 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2938
2939 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2940
2941 // This loop iterates through the list of reduce elements and copies,
2942 // element by element, from a remote lane in the warp to RemoteReduceList,
2943 // hosted on the thread's stack.
2944 emitReductionListCopy(
2945 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2946 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2947
2948 // The actions to be performed on the Remote Reduce list is dependent
2949 // on the algorithm version.
2950 //
2951 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2952 // LaneId % 2 == 0 && Offset > 0):
2953 // do the reduction value aggregation
2954 //
2955 // The thread local variable Reduce list is mutated in place to host the
2956 // reduced data, which is the aggregated value produced from local and
2957 // remote lanes.
2958 //
2959 // Note that AlgoVer is expected to be a constant integer known at compile
2960 // time.
2961 // When AlgoVer==0, the first conjunction evaluates to true, making
2962 // the entire predicate true during compile time.
2963 // When AlgoVer==1, the second conjunction has only the second part to be
2964 // evaluated during runtime. Other conjunctions evaluates to false
2965 // during compile time.
2966 // When AlgoVer==2, the third conjunction has only the second part to be
2967 // evaluated during runtime. Other conjunctions evaluates to false
2968 // during compile time.
2969 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2970 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2971 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2972 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2973 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2974 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2975 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2976 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2977 Value *RemoteOffsetComp =
2978 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2979 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2980 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2981 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2982
2983 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2984 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2985 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2986
  // then: ReduceFn(LocalReduceList, RemoteReduceList) folds the remote
  // values into the local list in place.
2987 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2990 ReduceList, Builder.getPtrTy());
2991 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2992 RemoteListAddrCast, Builder.getPtrTy());
2993 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2994 ->addFnAttr(Attribute::NoUnwind);
2995 Builder.CreateBr(MergeBB);
2996
2998 Builder.CreateBr(MergeBB);
2999
3001
3002 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3003 // Reduce list.
3004 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3005 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3006 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3007
3008 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3009 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3010 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3011 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3012
3013 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3014 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3015 ReductionInfos, RemoteListAddrCast, ReduceList);
3016 Builder.CreateBr(CpyMergeBB);
3017
3018 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3019 Builder.CreateBr(CpyMergeBB);
3020
3021 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3022
3024
3025 return SarFunc;
3026}
3027
// Builds `_omp_reduction_list_to_global_copy_func(ptr Buffer, i32 Idx,
// ptr ReduceList)`: copies every element of the thread-local reduce list
// into the global reduction buffer slot Buffer[Idx], dispatching on the
// element's evaluation kind — plain store for scalars, separate real/imag
// stores for complex values, and a memcpy for aggregates.
3028Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3029 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3030 AttributeList FuncAttrs) {
3032 LLVMContext &Ctx = M.getContext();
3035 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3036 /* IsVarArg */ false);
3037 Function *LtGCFunc =
3039 "_omp_reduction_list_to_global_copy_func", &M);
3040 LtGCFunc->setAttributes(FuncAttrs);
3041 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3042 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3043 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3044
3045 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3046 Builder.SetInsertPoint(EntryBlock);
3047
3048 // Buffer: global reduction buffer.
3049 Argument *BufferArg = LtGCFunc->getArg(0);
3050 // Idx: index of the buffer.
3051 Argument *IdxArg = LtGCFunc->getArg(1);
3052 // ReduceList: thread local Reduce list.
3053 Argument *ReduceListArg = LtGCFunc->getArg(2);
3054
  // Spill the arguments to allocas, address-space cast, store and reload —
  // the same calling-convention shim used by the sibling helpers below.
3055 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3056 BufferArg->getName() + ".addr");
3057 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3058 IdxArg->getName() + ".addr");
3059 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3060 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3062 BufferArgAlloca, Builder.getPtrTy(),
3063 BufferArgAlloca->getName() + ".ascast");
3065 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3066 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3067 ReduceListArgAlloca, Builder.getPtrTy(),
3068 ReduceListArgAlloca->getName() + ".ascast");
3069
3070 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3071 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3072 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3073
3074 Value *LocalReduceList =
3075 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3076 Value *BufferArgVal =
3077 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3078 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3079 Type *IndexTy = Builder.getIndexTy(
3081 for (auto En : enumerate(ReductionInfos)) {
3082 const ReductionInfo &RI = En.value();
3083 auto *RedListArrayTy =
3084 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3085 // Reduce element = LocalReduceList[i]
3086 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3087 RedListArrayTy, LocalReduceList,
3088 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3089 // elemptr = ((CopyType*)(elemptrptr)) + I
3090 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3091
3092 // Global = Buffer.VD[Idx];
3093 Value *BufferVD =
3094 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3096 ReductionsBufferTy, BufferVD, 0, En.index());
3097
  // The copy shape depends on how the reduction element is evaluated.
3098 switch (RI.EvaluationKind) {
3099 case EvalKind::Scalar: {
3100 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3101 Builder.CreateStore(TargetElement, GlobVal);
3102 break;
3103 }
3104 case EvalKind::Complex: {
3106 RI.ElementType, ElemPtr, 0, 0, ".realp");
3107 Value *SrcReal = Builder.CreateLoad(
3108 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3110 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3111 Value *SrcImg = Builder.CreateLoad(
3112 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3113
3115 RI.ElementType, GlobVal, 0, 0, ".realp");
3117 RI.ElementType, GlobVal, 0, 1, ".imagp");
3118 Builder.CreateStore(SrcReal, DestRealPtr);
3119 Builder.CreateStore(SrcImg, DestImgPtr);
3120 break;
3121 }
3122 case EvalKind::Aggregate: {
3123 Value *SizeVal =
3124 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3126 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3127 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3128 break;
3129 }
3130 }
3131 }
3132
3134 Builder.restoreIP(OldIP);
3135 return LtGCFunc;
3136}
3137
// Builds `_omp_reduction_list_to_global_reduce_func(ptr Buffer, i32 Idx,
// ptr ReduceList)`: materializes a local array of pointers into the global
// buffer slot Buffer[Idx] (one pointer per reduction element) and calls
// ReduceFn(GlobalReduceList, ReduceList) to fold the thread-local values
// into the global buffer.
3138Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3139 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3140 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3142 LLVMContext &Ctx = M.getContext();
3145 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3146 /* IsVarArg */ false);
3147 Function *LtGRFunc =
3149 "_omp_reduction_list_to_global_reduce_func", &M);
3150 LtGRFunc->setAttributes(FuncAttrs);
3151 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3152 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3153 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3154
3155 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3156 Builder.SetInsertPoint(EntryBlock);
3157
3158 // Buffer: global reduction buffer.
3159 Argument *BufferArg = LtGRFunc->getArg(0);
3160 // Idx: index of the buffer.
3161 Argument *IdxArg = LtGRFunc->getArg(1);
3162 // ReduceList: thread local Reduce list.
3163 Argument *ReduceListArg = LtGRFunc->getArg(2);
3164
3165 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3166 BufferArg->getName() + ".addr");
3167 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3168 IdxArg->getName() + ".addr");
3169 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3170 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3171 auto *RedListArrayTy =
3172 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3173
3174 // 1. Build a list of reduction variables.
3175 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3176 Value *LocalReduceList =
3177 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3178
3180 BufferArgAlloca, Builder.getPtrTy(),
3181 BufferArgAlloca->getName() + ".ascast");
3183 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3184 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3185 ReduceListArgAlloca, Builder.getPtrTy(),
3186 ReduceListArgAlloca->getName() + ".ascast");
3187 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3188 LocalReduceList, Builder.getPtrTy(),
3189 LocalReduceList->getName() + ".ascast");
3190
3191 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3192 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3193 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3194
3195 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3196 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3197 Type *IndexTy = Builder.getIndexTy(
  // Fill the local list with pointers to each element's slot inside the
  // global buffer entry Buffer[Idx].
3199 for (auto En : enumerate(ReductionInfos)) {
3200 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3201 RedListArrayTy, LocalReduceListAddrCast,
3202 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3203 Value *BufferVD =
3204 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3205 // Global = Buffer.VD[Idx];
3207 ReductionsBufferTy, BufferVD, 0, En.index());
3208 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3209 }
3210
3211 // Call reduce_function(GlobalReduceList, ReduceList)
3212 Value *ReduceList =
3213 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3214 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3215 ->addFnAttr(Attribute::NoUnwind);
3217 Builder.restoreIP(OldIP);
3218 return LtGRFunc;
3219}
3220
// Builds `_omp_reduction_global_to_list_copy_func(ptr Buffer, i32 Idx,
// ptr ReduceList)`: the inverse of the list-to-global copy — each element is
// copied from the global reduction buffer slot Buffer[Idx] back into the
// thread-local reduce list, dispatching on the element's evaluation kind
// (scalar store, real/imag pair for complex, memcpy for aggregates).
3221Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3222 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3223 AttributeList FuncAttrs) {
3225 LLVMContext &Ctx = M.getContext();
3228 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3229 /* IsVarArg */ false);
3230 Function *LtGCFunc =
3232 "_omp_reduction_global_to_list_copy_func", &M);
3233 LtGCFunc->setAttributes(FuncAttrs);
3234 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3235 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3236 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3237
3238 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3239 Builder.SetInsertPoint(EntryBlock);
3240
3241 // Buffer: global reduction buffer.
3242 Argument *BufferArg = LtGCFunc->getArg(0);
3243 // Idx: index of the buffer.
3244 Argument *IdxArg = LtGCFunc->getArg(1);
3245 // ReduceList: thread local Reduce list.
3246 Argument *ReduceListArg = LtGCFunc->getArg(2);
3247
3248 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3249 BufferArg->getName() + ".addr");
3250 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3251 IdxArg->getName() + ".addr");
3252 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3253 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3255 BufferArgAlloca, Builder.getPtrTy(),
3256 BufferArgAlloca->getName() + ".ascast");
3258 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3259 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3260 ReduceListArgAlloca, Builder.getPtrTy(),
3261 ReduceListArgAlloca->getName() + ".ascast");
3262 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3263 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3264 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3265
3266 Value *LocalReduceList =
3267 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3268 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3269 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3270 Type *IndexTy = Builder.getIndexTy(
3272 for (auto En : enumerate(ReductionInfos)) {
3273 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3274 auto *RedListArrayTy =
3275 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3276 // Reduce element = LocalReduceList[i]
3277 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3278 RedListArrayTy, LocalReduceList,
3279 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3280 // elemptr = ((CopyType*)(elemptrptr)) + I
3281 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3282 // Global = Buffer.VD[Idx];
3283 Value *BufferVD =
3284 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3286 ReductionsBufferTy, BufferVD, 0, En.index());
3287
  // Global buffer is the source here; the local list is the destination.
3288 switch (RI.EvaluationKind) {
3289 case EvalKind::Scalar: {
3290 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3291 Builder.CreateStore(TargetElement, ElemPtr);
3292 break;
3293 }
3294 case EvalKind::Complex: {
3296 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3297 Value *SrcReal = Builder.CreateLoad(
3298 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3300 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3301 Value *SrcImg = Builder.CreateLoad(
3302 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3303
3305 RI.ElementType, ElemPtr, 0, 0, ".realp");
3307 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3308 Builder.CreateStore(SrcReal, DestRealPtr);
3309 Builder.CreateStore(SrcImg, DestImgPtr);
3310 break;
3311 }
3312 case EvalKind::Aggregate: {
3313 Value *SizeVal =
3317 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3318 SizeVal, false);
3319 break;
3320 }
3321 }
3322 }
3323
3325 Builder.restoreIP(OldIP);
3326 return LtGCFunc;
3327}
3328
// Builds `_omp_reduction_global_to_list_reduce_func(ptr Buffer, i32 Idx,
// ptr ReduceList)`: materializes a local array of pointers into the global
// buffer slot Buffer[Idx] and calls ReduceFn(ReduceList, GlobalReduceList) —
// note the argument order is the mirror image of
// emitListToGlobalReduceFunction, folding the global values into the
// thread-local list.
3329Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3330 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3331 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3333 LLVMContext &Ctx = M.getContext();
3334 auto *FuncTy = FunctionType::get(
3336 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3337 /* IsVarArg */ false);
3338 Function *LtGRFunc =
3340 "_omp_reduction_global_to_list_reduce_func", &M);
3341 LtGRFunc->setAttributes(FuncAttrs);
3342 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3343 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3344 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3345
3346 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3347 Builder.SetInsertPoint(EntryBlock);
3348
3349 // Buffer: global reduction buffer.
3350 Argument *BufferArg = LtGRFunc->getArg(0);
3351 // Idx: index of the buffer.
3352 Argument *IdxArg = LtGRFunc->getArg(1);
3353 // ReduceList: thread local Reduce list.
3354 Argument *ReduceListArg = LtGRFunc->getArg(2);
3355
3356 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3357 BufferArg->getName() + ".addr");
3358 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3359 IdxArg->getName() + ".addr");
3360 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3361 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3362 ArrayType *RedListArrayTy =
3363 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3364
3365 // 1. Build a list of reduction variables.
3366 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3367 Value *LocalReduceList =
3368 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3369
3371 BufferArgAlloca, Builder.getPtrTy(),
3372 BufferArgAlloca->getName() + ".ascast");
3374 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3375 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3376 ReduceListArgAlloca, Builder.getPtrTy(),
3377 ReduceListArgAlloca->getName() + ".ascast");
3379 LocalReduceList, Builder.getPtrTy(),
3380 LocalReduceList->getName() + ".ascast");
3381
3382 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3383 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3384 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3385
3386 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3387 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3388 Type *IndexTy = Builder.getIndexTy(
  // Fill the local list with pointers to each element's slot inside the
  // global buffer entry Buffer[Idx].
3390 for (auto En : enumerate(ReductionInfos)) {
3391 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3392 RedListArrayTy, ReductionList,
3393 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3394 // Global = Buffer.VD[Idx];
3395 Value *BufferVD =
3396 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3398 ReductionsBufferTy, BufferVD, 0, En.index());
3399 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3400 }
3401
3402 // Call reduce_function(ReduceList, GlobalReduceList)
3403 Value *ReduceList =
3404 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3405 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3406 ->addFnAttr(Attribute::NoUnwind);
3408 Builder.restoreIP(OldIP);
3409 return LtGRFunc;
3410}
3411
3412std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3413 std::string Suffix =
3414 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3415 return (Name + Suffix).str();
3416}
3417
// Emits the reduction function `void <name>(ptr LHSArray, ptr RHSArray)`,
// where both arguments point to arrays of pointers to the reduction
// variables. For each ReductionInfo the LHS/RHS element pointers are loaded
// and address-space cast; then, depending on ReductionGenCBKind, either
// (a) the pointers are collected and the Clang-style ReductionGenClang
// callbacks are invoked afterwards, with their placeholder LHS/RHS values
// patched (only for uses inside this function) via replaceUsesWithIf, or
// (b) the ReductionGen callback is run inline and its result is stored back
// through the LHS pointer. Returns the function, or an error propagated
// from a ReductionGen callback.
3418Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3419 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3420 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3421 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3422 {Builder.getPtrTy(), Builder.getPtrTy()},
3423 /* IsVarArg */ false);
3424 std::string Name = getReductionFuncName(ReducerName);
3425 Function *ReductionFunc =
3427 ReductionFunc->setAttributes(FuncAttrs);
3428 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3429 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3430 BasicBlock *EntryBB =
3431 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3432 Builder.SetInsertPoint(EntryBB);
3433
3434 // Need to alloca memory here and deal with the pointers before getting
3435 // LHS/RHS pointers out
3436 Value *LHSArrayPtr = nullptr;
3437 Value *RHSArrayPtr = nullptr;
3438 Argument *Arg0 = ReductionFunc->getArg(0);
3439 Argument *Arg1 = ReductionFunc->getArg(1);
3440 Type *Arg0Type = Arg0->getType();
3441 Type *Arg1Type = Arg1->getType();
3442
3443 Value *LHSAlloca =
3444 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3445 Value *RHSAlloca =
3446 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3448 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3450 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3451 Builder.CreateStore(Arg0, LHSAddrCast);
3452 Builder.CreateStore(Arg1, RHSAddrCast);
3453 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3454 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3455
3456 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3457 Type *IndexTy = Builder.getIndexTy(
3459 SmallVector<Value *> LHSPtrs, RHSPtrs;
3460 for (auto En : enumerate(ReductionInfos)) {
3461 const ReductionInfo &RI = En.value();
  // Load the i-th element pointer out of each pointer array and cast it to
  // the address space of the corresponding reduction variable.
3462 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3463 RedArrayTy, RHSArrayPtr,
3464 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3465 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3467 RHSI8Ptr, RI.PrivateVariable->getType(),
3468 RHSI8Ptr->getName() + ".ascast");
3469
3470 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3471 RedArrayTy, LHSArrayPtr,
3472 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3473 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3475 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3476
  // Clang-callback path: remember the pointers for the fixup loop below.
3478 LHSPtrs.emplace_back(LHSPtr);
3479 RHSPtrs.emplace_back(RHSPtr);
3480 } else {
  // Inline path: run the reduction generator now and store the result
  // back through the LHS pointer. A null insert block signals the
  // callback finished the function body itself.
3481 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3482 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3483 Value *Reduced;
3484 InsertPointOrErrorTy AfterIP =
3485 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3486 if (!AfterIP)
3487 return AfterIP.takeError();
3488 if (!Builder.GetInsertBlock())
3489 return ReductionFunc;
3490 Builder.CreateStore(Reduced, LHSPtr);
3491 }
3492 }
3493
3495 for (auto En : enumerate(ReductionInfos)) {
3496 unsigned Index = En.index();
3497 const ReductionInfo &RI = En.value();
3498 Value *LHSFixupPtr, *RHSFixupPtr;
3499 Builder.restoreIP(RI.ReductionGenClang(
3500 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3501
3502 // Fix the CallBack code generated to use the correct Values for the LHS
3503 // and RHS; only uses inside this function are rewritten.
3504 LHSFixupPtr->replaceUsesWithIf(
3505 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3506 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3507 ReductionFunc;
3508 });
3509 RHSFixupPtr->replaceUsesWithIf(
3510 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3511 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3512 ReductionFunc;
3513 });
3514 }
3515
3517 return ReductionFunc;
3518}
3519
3520static void
3522 bool IsGPU) {
3523 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3524 (void)RI;
3525 assert(RI.Variable && "expected non-null variable");
3526 assert(RI.PrivateVariable && "expected non-null private variable");
3527 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3528 "expected non-null reduction generator callback");
3529 if (!IsGPU) {
3530 assert(
3531 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3532 "expected variables and their private equivalents to have the same "
3533 "type");
3534 }
3535 assert(RI.Variable->getType()->isPointerTy() &&
3536 "expected variables to be pointers");
3537 }
3538}
3539
// ===== OpenMPIRBuilder::createReductionsGPU (device reduction codegen) =====
// Packs the private copy of every reduction variable into a type-erased
// pointer array ("RedList"), emits the helper functions the device runtime
// needs (shuffle-and-reduce, inter-warp copy, and — for teams reductions —
// the list<->global buffer copy/reduce helpers), then calls the
// __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2 entry point and finalizes
// the winning thread's values in the ".omp.reduction.then" block.
// NOTE(review): this rendering dropped the definition's first line and
// several continuation lines (original lines 3540, 3600, 3604, 3611,
// 3638/3640/3643, 3685, 3699/3701/3703, 3722) — compare against upstream
// LLVM before editing; each code line below also carries its original line
// number from the extraction.
3541 const LocationDescription &Loc, InsertPointTy AllocaIP,
3542 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3543 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3544 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3545 unsigned ReductionBufNum, Value *SrcLocInfo) {
3546 if (!updateToLocation(Loc))
3547 return InsertPointTy();
3548 Builder.restoreIP(CodeGenIP);
3549 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
// NOTE(review): HasDistribute is not referenced anywhere in the visible body.
3550 LLVMContext &Ctx = M.getContext();
3551
3552 // Source location for the ident struct
3553 if (!SrcLocInfo) {
3554 uint32_t SrcLocStrSize;
3555 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3556 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3557 }
3558
// Nothing to reduce: leave the IR untouched.
3559 if (ReductionInfos.size() == 0)
3560 return Builder.saveIP();
3561
// Propagate the enclosing function's attributes to the generated helper
// functions, but drop optnone so the helpers remain optimizable.
3562 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3563 AttributeList FuncAttrs;
3564 AttrBuilder AttrBldr(Ctx);
3565 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3566 AttrBldr.addAttribute(Attr);
3567 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3568 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3569
// Create the outlined elementwise reduction function, preserving the current
// insert point around the call.
3570 CodeGenIP = Builder.saveIP();
3571 Expected<Function *> ReductionResult =
3572 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3573 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3574 if (!ReductionResult)
3575 return ReductionResult.takeError();
3576 Function *ReductionFunc = *ReductionResult;
3577 Builder.restoreIP(CodeGenIP);
3578
3579 // Set the grid value in the config needed for lowering later on
3580 if (GridValue.has_value())
3581 Config.setGridValue(GridValue.value());
3582 else
3583 Config.setGridValue(getGridValue(T, ReductionFunc));
3584
3585 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3586 // RedList, shuffle_reduce_func, interwarp_copy_func);
3587 // or
3588 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3589 Value *Res;
3590
3591 // 1. Build a list of reduction variables.
3592 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3593 auto Size = ReductionInfos.size();
3594 Type *PtrTy = PointerType::getUnqual(Ctx);
3595 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
// The array itself is allocated at AllocaIP; codegen then resumes at the
// saved CodeGenIP.
3596 CodeGenIP = Builder.saveIP();
3597 Builder.restoreIP(AllocaIP);
3598 Value *ReductionListAlloca =
3599 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3601 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3602 Builder.restoreIP(CodeGenIP);
3603 Type *IndexTy = Builder.getIndexTy(
// Store each private reduction variable into its slot of the red_list array.
3605 for (auto En : enumerate(ReductionInfos)) {
3606 const ReductionInfo &RI = En.value();
3607 Value *ElemPtr = Builder.CreateInBoundsGEP(
3608 RedArrayTy, ReductionList,
3609 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3610 Value *CastElem =
3612 Builder.CreateStore(CastElem, ElemPtr);
3613 }
// Emit the shuffle-and-reduce and inter-warp-copy helpers; restore the
// insert point afterwards since the emitters move it.
3614 CodeGenIP = Builder.saveIP();
3615 Function *SarFunc =
3616 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3617 Expected<Function *> CopyResult =
3618 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3619 if (!CopyResult)
3620 return CopyResult.takeError();
3621 Function *WcFunc = *CopyResult;
3622 Builder.restoreIP(CodeGenIP);
3623
3624 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3625
// Reduction data size passed to the runtime: widest element times the number
// of reductions.
3626 unsigned MaxDataSize = 0;
3627 SmallVector<Type *> ReductionTypeArgs;
3628 for (auto En : enumerate(ReductionInfos)) {
3629 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3630 if (Size > MaxDataSize)
3631 MaxDataSize = Size;
3632 ReductionTypeArgs.emplace_back(En.value().ElementType);
3633 }
3634 Value *ReductionDataSize =
3635 Builder.getInt64(MaxDataSize * ReductionInfos.size());
// Parallel (non-teams) reduction: call the parallel reduce runtime entry
// with the packed list plus the two helper functions.
3636 if (!IsTeamsReduction) {
3637 Value *SarFuncCast =
3639 Value *WcFuncCast =
3641 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3642 WcFuncCast};
3644 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3645 Res = Builder.CreateCall(Pv2Ptr, Args);
3646 } else {
// Teams reduction: additionally needs the fixed team-reduction buffer and
// the four list<->global copy/reduce helper functions.
3647 CodeGenIP = Builder.saveIP();
3648 StructType *ReductionsBufferTy = StructType::create(
3649 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3650 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3651 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3652 Function *LtGCFunc = emitListToGlobalCopyFunction(
3653 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3654 Function *LtGRFunc = emitListToGlobalReduceFunction(
3655 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3656 Function *GtLCFunc = emitGlobalToListCopyFunction(
3657 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3658 Function *GtLRFunc = emitGlobalToListReduceFunction(
3659 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3660 Builder.restoreIP(CodeGenIP);
3661
3662 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3663 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3664
3665 Value *Args3[] = {SrcLocInfo,
3666 KernelTeamsReductionPtr,
3667 Builder.getInt32(ReductionBufNum),
3668 ReductionDataSize,
3669 RL,
3670 SarFunc,
3671 WcFunc,
3672 LtGCFunc,
3673 LtGRFunc,
3674 GtLCFunc,
3675 GtLRFunc};
3676
3677 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3678 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3679 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3680 }
3681
3682 // 5. Build if (res == 1)
3683 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3684 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3686 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3687
3688 // 6. Build then branch: where we have reduced values in the master
3689 // thread in each team.
3690 // __kmpc_end_reduce{_nowait}(<gtid>);
3691 // break;
3692 emitBlock(ThenBB, CurFunc);
3693
3694 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
// Clang-style callbacks emit placeholder LHS/RHS pointers; the fixups below
// rewrite the placeholders' uses inside ReductionFunc to the real values.
3695 for (auto En : enumerate(ReductionInfos)) {
3696 const ReductionInfo &RI = En.value();
3697 Value *LHS = RI.Variable;
3698 Value *RHS =
3700
3702 Value *LHSPtr, *RHSPtr;
3704 &LHSPtr, &RHSPtr, CurFunc));
3705
3706 // Fix the CallBack code generated to use the correct Values for the LHS
3707 // and RHS
3708 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3709 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3710 ReductionFunc;
3711 });
3712 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3713 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3714 ReductionFunc;
3715 });
3716 } else {
3717 assert(false && "Unhandled ReductionGenCBKind");
3718 }
3719 }
3720 emitBlock(ExitBB, CurFunc);
3721
3723
3724 return Builder.saveIP();
3725}
3726
3728 Type *VoidTy = Type::getVoidTy(M.getContext());
3729 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3730 auto *FuncTy =
3731 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3733 ".omp.reduction.func", &M);
3734}
3735
// ===== OpenMPIRBuilder::createReductions (host reduction codegen) =====
// Builds a type-erased array of pointers to the private reduction values,
// calls __kmpc_reduce{_nowait}, switches on its result to the non-atomic
// (case 1) or atomic (case 2) finalization path, and fills in the outlined
// ".omp.reduction.func" body that the runtime uses for tree reductions.
// NOTE(review): this rendering dropped the signature lines and several
// continuation lines (original lines 3736-3737, 3766, 3781, 3799, 3826,
// 3835, 3862-3863, 3872, 3886, 3891, 3895, 3909) — compare against upstream
// LLVM before editing; each code line below also carries its original line
// number from the extraction.
3738 InsertPointTy AllocaIP,
3739 ArrayRef<ReductionInfo> ReductionInfos,
3740 ArrayRef<bool> IsByRef, bool IsNoWait) {
// Validate the reduction descriptors (host rules: variable and private copy
// must share one pointer type; ReductionGen is mandatory here).
3741 assert(ReductionInfos.size() == IsByRef.size());
3742 for (const ReductionInfo &RI : ReductionInfos) {
3743 (void)RI;
3744 assert(RI.Variable && "expected non-null variable");
3745 assert(RI.PrivateVariable && "expected non-null private variable");
3746 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3747 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3748 "expected variables and their private equivalents to have the same "
3749 "type");
3750 assert(RI.Variable->getType()->isPointerTy() &&
3751 "expected variables to be pointers");
3752 }
3753
3754 if (!updateToLocation(Loc))
3755 return InsertPointTy();
3756
// Split off everything after the insertion point into "reduce.finalize";
// the reduction control flow is emitted in between.
3757 BasicBlock *InsertBlock = Loc.IP.getBlock();
3758 BasicBlock *ContinuationBlock =
3759 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3760 InsertBlock->getTerminator()->eraseFromParent();
3761
3762 // Create and populate array of type-erased pointers to private reduction
3763 // values.
3764 unsigned NumReductions = ReductionInfos.size();
3765 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3767 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3768
3769 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3770
3771 for (auto En : enumerate(ReductionInfos)) {
3772 unsigned Index = En.index();
3773 const ReductionInfo &RI = En.value();
3774 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3775 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3776 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3777 }
3778
3779 // Emit a call to the runtime function that orchestrates the reduction.
3780 // Declare the reduction function in the process.
3782 Module *Module = Func->getParent();
3783 uint32_t SrcLocStrSize;
3784 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// The atomic fallback (case 2 of the switch below) is only viable if every
// reduction provides an AtomicReductionGen callback; the ident flag tells
// the runtime whether it may pick that path.
3785 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3786 return RI.AtomicReductionGen;
3787 });
3788 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3789 CanGenerateAtomic
3790 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3791 : IdentFlag(0));
3792 Value *ThreadId = getOrCreateThreadID(Ident);
3793 Constant *NumVariables = Builder.getInt32(NumReductions);
3794 const DataLayout &DL = Module->getDataLayout();
3795 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3796 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3797 Function *ReductionFunc = getFreshReductionFunc(*Module);
3798 Value *Lock = getOMPCriticalRegionLock(".reduction");
3800 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3801 : RuntimeFunction::OMPRTL___kmpc_reduce);
3802 CallInst *ReduceCall =
3803 Builder.CreateCall(ReduceFunc,
3804 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3805 ReductionFunc, Lock},
3806 "reduce");
3807
3808 // Create final reduction entry blocks for the atomic and non-atomic case.
3809 // Emit IR that dispatches control flow to one of the blocks based on the
3810 // reduction supporting the atomic mode.
3811 BasicBlock *NonAtomicRedBlock =
3812 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3813 BasicBlock *AtomicRedBlock =
3814 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3815 SwitchInst *Switch =
3816 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3817 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3818 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3819
3820 // Populate the non-atomic reduction using the elementwise reduction function.
3821 // This loads the elements from the global and private variables and reduces
3822 // them before storing back the result to the global variable.
3823 Builder.SetInsertPoint(NonAtomicRedBlock);
3824 for (auto En : enumerate(ReductionInfos)) {
3825 const ReductionInfo &RI = En.value();
3827 // We have one less load for by-ref case because that load is now inside of
3828 // the reduction region
3829 Value *RedValue = RI.Variable;
3830 if (!IsByRef[En.index()]) {
3831 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3832 "red.value." + Twine(En.index()));
3833 }
3834 Value *PrivateRedValue =
3836 "red.private.value." + Twine(En.index()));
3837 Value *Reduced;
3838 InsertPointOrErrorTy AfterIP =
3839 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3840 if (!AfterIP)
3841 return AfterIP.takeError();
3842 Builder.restoreIP(*AfterIP);
3843
// A null insert block signals that the callback terminated codegen.
3844 if (!Builder.GetInsertBlock())
3845 return InsertPointTy();
3846 // for by-ref case, the load is inside of the reduction region
3847 if (!IsByRef[En.index()])
3848 Builder.CreateStore(Reduced, RI.Variable);
3849 }
3850 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3851 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3852 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3853 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3854 Builder.CreateBr(ContinuationBlock);
3855
3856 // Populate the atomic reduction using the atomic elementwise reduction
3857 // function. There are no loads/stores here because they will be happening
3858 // inside the atomic elementwise reduction.
3859 Builder.SetInsertPoint(AtomicRedBlock);
3860 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3861 for (const ReductionInfo &RI : ReductionInfos) {
3864 if (!AfterIP)
3865 return AfterIP.takeError();
3866 Builder.restoreIP(*AfterIP);
3867 if (!Builder.GetInsertBlock())
3868 return InsertPointTy();
3869 }
3870 Builder.CreateBr(ContinuationBlock);
3871 } else {
3873 }
3874
3875 // Populate the outlined reduction function using the elementwise reduction
3876 // function. Partial values are extracted from the type-erased array of
3877 // pointers to private variables.
3878 BasicBlock *ReductionFuncBlock =
3879 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3880 Builder.SetInsertPoint(ReductionFuncBlock);
3881 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3882 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3883
3884 for (auto En : enumerate(ReductionInfos)) {
3885 const ReductionInfo &RI = En.value();
3887 RedArrayTy, LHSArrayPtr, 0, En.index());
3888 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3889 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3890 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3892 RedArrayTy, RHSArrayPtr, 0, En.index());
3893 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3894 Value *RHSPtr =
3896 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3897 Value *Reduced;
3898 InsertPointOrErrorTy AfterIP =
3899 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3900 if (!AfterIP)
3901 return AfterIP.takeError();
3902 Builder.restoreIP(*AfterIP);
3903 if (!Builder.GetInsertBlock())
3904 return InsertPointTy();
3905 // store is inside of the reduction region when using by-ref
3906 if (!IsByRef[En.index()])
3907 Builder.CreateStore(Reduced, LHSPtr);
3908 }
3910
// Resume normal code generation after the reduction.
3911 Builder.SetInsertPoint(ContinuationBlock);
3912 return Builder.saveIP();
3913}
3914
3917 BodyGenCallbackTy BodyGenCB,
3918 FinalizeCallbackTy FiniCB) {
3919 if (!updateToLocation(Loc))
3920 return Loc.IP;
3921
3922 Directive OMPD = Directive::OMPD_master;
3923 uint32_t SrcLocStrSize;
3924 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3925 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3926 Value *ThreadId = getOrCreateThreadID(Ident);
3927 Value *Args[] = {Ident, ThreadId};
3928
3929 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3930 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3931
3932 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3933 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3934
3935 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3936 /*Conditional*/ true, /*hasFinalize*/ true);
3937}
3938
3941 BodyGenCallbackTy BodyGenCB,
3942 FinalizeCallbackTy FiniCB, Value *Filter) {
3943 if (!updateToLocation(Loc))
3944 return Loc.IP;
3945
3946 Directive OMPD = Directive::OMPD_masked;
3947 uint32_t SrcLocStrSize;
3948 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3949 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3950 Value *ThreadId = getOrCreateThreadID(Ident);
3951 Value *Args[] = {Ident, ThreadId, Filter};
3952 Value *ArgsEnd[] = {Ident, ThreadId};
3953
3954 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3955 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3956
3957 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3958 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3959
3960 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3961 /*Conditional*/ true, /*hasFinalize*/ true);
3962}
3963
3965 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3966 BasicBlock *PostInsertBefore, const Twine &Name) {
3967 Module *M = F->getParent();
3968 LLVMContext &Ctx = M->getContext();
3969 Type *IndVarTy = TripCount->getType();
3970
3971 // Create the basic block structure.
3972 BasicBlock *Preheader =
3973 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3974 BasicBlock *Header =
3975 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3976 BasicBlock *Cond =
3977 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3978 BasicBlock *Body =
3979 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3980 BasicBlock *Latch =
3981 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3982 BasicBlock *Exit =
3983 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3984 BasicBlock *After =
3985 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3986
3987 // Use specified DebugLoc for new instructions.
3989
3990 Builder.SetInsertPoint(Preheader);
3991 Builder.CreateBr(Header);
3992
3993 Builder.SetInsertPoint(Header);
3994 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3995 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3997
3999 Value *Cmp =
4000 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4001 Builder.CreateCondBr(Cmp, Body, Exit);
4002
4003 Builder.SetInsertPoint(Body);
4004 Builder.CreateBr(Latch);
4005
4006 Builder.SetInsertPoint(Latch);
4007 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4008 "omp_" + Name + ".next", /*HasNUW=*/true);
4009 Builder.CreateBr(Header);
4010 IndVarPHI->addIncoming(Next, Latch);
4011
4012 Builder.SetInsertPoint(Exit);
4014
4015 // Remember and return the canonical control flow.
4016 LoopInfos.emplace_front();
4017 CanonicalLoopInfo *CL = &LoopInfos.front();
4018
4019 CL->Header = Header;
4020 CL->Cond = Cond;
4021 CL->Latch = Latch;
4022 CL->Exit = Exit;
4023
4024#ifndef NDEBUG
4025 CL->assertOK();
4026#endif
4027 return CL;
4028}
4029
4032 LoopBodyGenCallbackTy BodyGenCB,
4033 Value *TripCount, const Twine &Name) {
4034 BasicBlock *BB = Loc.IP.getBlock();
4035 BasicBlock *NextBB = BB->getNextNode();
4036
4037 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4038 NextBB, NextBB, Name);
4039 BasicBlock *After = CL->getAfter();
4040
4041 // If location is not set, don't connect the loop.
4042 if (updateToLocation(Loc)) {
4043 // Split the loop at the insertion point: Branch to the preheader and move
4044 // every following instruction to after the loop (the After BB). Also, the
4045 // new successor is the loop's after block.
4046 spliceBB(Builder, After, /*CreateBranch=*/false);
4048 }
4049
4050 // Emit the body content. We do it after connecting the loop to the CFG to
4051 // avoid that the callback encounters degenerate BBs.
4052 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4053 return Err;
4054
4055#ifndef NDEBUG
4056 CL->assertOK();
4057#endif
4058 return CL;
4059}
4060
4062 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4063 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4064 InsertPointTy ComputeIP, const Twine &Name) {
4065
4066 // Consider the following difficulties (assuming 8-bit signed integers):
4067 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4068 // DO I = 1, 100, 50
4069 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
4070 // DO I = 100, 0, -128
4071
4072 // Start, Stop and Step must be of the same integer type.
4073 auto *IndVarTy = cast<IntegerType>(Start->getType());
4074 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4075 assert(IndVarTy == Step->getType() && "Step type mismatch");
4076
4077 LocationDescription ComputeLoc =
4078 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4079 updateToLocation(ComputeLoc);
4080
4081 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4082 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4083
4084 // Like Step, but always positive.
4085 Value *Incr = Step;
4086
4087 // Distance between Start and Stop; always positive.
4088 Value *Span;
4089
4090 // Condition whether there are no iterations are executed at all, e.g. because
4091 // UB < LB.
4092 Value *ZeroCmp;
4093
4094 if (IsSigned) {
4095 // Ensure that increment is positive. If not, negate and invert LB and UB.
4096 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4097 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4098 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4099 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4100 Span = Builder.CreateSub(UB, LB, "", false, true);
4101 ZeroCmp = Builder.CreateICmp(
4102 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4103 } else {
4104 Span = Builder.CreateSub(Stop, Start, "", true);
4105 ZeroCmp = Builder.CreateICmp(
4106 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4107 }
4108
4109 Value *CountIfLooping;
4110 if (InclusiveStop) {
4111 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4112 } else {
4113 // Avoid incrementing past stop since it could overflow.
4114 Value *CountIfTwo = Builder.CreateAdd(
4115 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4116 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4117 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4118 }
4119 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4120 "omp_" + Name + ".tripcount");
4121
4122 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4123 Builder.restoreIP(CodeGenIP);
4124 Value *Span = Builder.CreateMul(IV, Step);
4125 Value *IndVar = Builder.CreateAdd(Span, Start);
4126 return BodyGenCB(Builder.saveIP(), IndVar);
4127 };
4128 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4129 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4130}
4131
4132// Returns an LLVM function to call for initializing loop bounds using OpenMP
4133// static scheduling depending on `type`. Only i32 and i64 are supported by the
4134// runtime. Always interpret integers as unsigned similarly to
4135// CanonicalLoopInfo.
4137 OpenMPIRBuilder &OMPBuilder) {
4138 unsigned Bitwidth = Ty->getIntegerBitWidth();
4139 if (Bitwidth == 32)
4140 return OMPBuilder.getOrCreateRuntimeFunction(
4141 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4142 if (Bitwidth == 64)
4143 return OMPBuilder.getOrCreateRuntimeFunction(
4144 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4145 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4146}
4147
// ===== OpenMPIRBuilder::applyStaticWorkshareLoop =====
// Lowers a canonical loop to OpenMP static (unchunked) worksharing:
// allocates the lastiter/lowerbound/upperbound/stride slots, calls
// __kmpc_for_static_init_{4u,8u} to obtain this thread's bounds, rebases the
// loop onto them (new trip count = ub - lb + 1; every IV use becomes
// IV + lb), calls __kmpc_for_static_fini in the exit block, and optionally
// emits a 'for' barrier. Invalidates the CanonicalLoopInfo and returns its
// former after-IP.
// NOTE(review): this rendering dropped the return-type line and several
// continuation lines (original lines 4148, 4157-4158, 4184, 4213, 4215,
// 4220) — compare against upstream LLVM before editing; each code line
// below also carries its original line number from the extraction.
4149 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4150 InsertPointTy AllocaIP,
4151 bool NeedsBarrier) {
4152 assert(CLI->isValid() && "Requires a valid canonical loop");
4153 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4154 "Require dedicated allocate IP");
4155
4156 // Set up the source location value for OpenMP runtime.
4159
4160 uint32_t SrcLocStrSize;
4161 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4162 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4163
4164 // Declare useful OpenMP runtime functions.
4165 Value *IV = CLI->getIndVar();
4166 Type *IVTy = IV->getType();
4167 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4168 FunctionCallee StaticFini =
4169 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4170
4171 // Allocate space for computed loop bounds as expected by the "init" function.
4172 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4173
4174 Type *I32Type = Type::getInt32Ty(M.getContext());
4175 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4176 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4177 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4178 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4179
4180 // At the end of the preheader, prepare for calling the "init" function by
4181 // storing the current loop bounds into the allocated space. A canonical loop
4182 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4183 // and produces an inclusive upper bound.
4185 Constant *Zero = ConstantInt::get(IVTy, 0)
4186 Constant *One = ConstantInt::get(IVTy, 1);
4187 Builder.CreateStore(Zero, PLowerBound);
4188 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4189 Builder.CreateStore(UpperBound, PUpperBound);
4190 Builder.CreateStore(One, PStride);
4191
4192 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4193
4194 Constant *SchedulingType = ConstantInt::get(
4195 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4196
4197 // Call the "init" function and update the trip count of the loop with the
4198 // value it produced.
4199 Builder.CreateCall(StaticInit,
4200 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4201 PUpperBound, PStride, One, Zero});
4202 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4203 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4204 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4205 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4206 CLI->setTripCount(TripCount);
4207
4208 // Update all uses of the induction variable except the one in the condition
4209 // block that compares it with the actual upper bound, and the increment in
4210 // the latch block.
4211
4212 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4214 CLI->getBody()->getFirstInsertionPt());
4216 return Builder.CreateAdd(OldIV, LowerBound);
4217 });
4218
4219 // In the "exit" block, call the "fini" function.
4221 CLI->getExit()->getTerminator()->getIterator());
4222 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4223
4224 // Add the barrier if requested.
4225 if (NeedsBarrier) {
4226 InsertPointOrErrorTy BarrierIP =
4227 createBarrier(LocationDescription(Builder.saveIP(), DL),
4228 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4229 /* CheckCancelFlag */ false);
4230 if (!BarrierIP)
4231 return BarrierIP.takeError();
4232 }
4233
// The loop structure was rewritten, so the CanonicalLoopInfo no longer
// describes a canonical loop; capture the after-IP before invalidating.
4234 InsertPointTy AfterIP = CLI->getAfterIP();
4235 CLI->invalidate();
4236
4237 return AfterIP;
4238}
4239
4241OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4242 CanonicalLoopInfo *CLI,
4243 InsertPointTy AllocaIP,
4244 bool NeedsBarrier,
4245 Value *ChunkSize) {
4246 assert(CLI->isValid() && "Requires a valid canonical loop");
4247 assert(ChunkSize && "Chunk size is required");
4248
4249 LLVMContext &Ctx = CLI->getFunction()->getContext();
4250 Value *IV = CLI->getIndVar();
4251 Value *OrigTripCount = CLI->getTripCount();
4252 Type *IVTy = IV->getType();
4253 assert(IVTy->getIntegerBitWidth() <= 64 &&
4254 "Max supported tripcount bitwidth is 64 bits");
4255 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4256 : Type::getInt64Ty(Ctx);
4257 Type *I32Type = Type::getInt32Ty(M.getContext());
4258 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4259 Constant *One = ConstantInt::get(InternalIVTy, 1);
4260
4261 // Declare useful OpenMP runtime functions.
4262 FunctionCallee StaticInit =
4263 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4264 FunctionCallee StaticFini =
4265 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4266
4267 // Allocate space for computed loop bounds as expected by the "init" function.
4268 Builder.restoreIP(AllocaIP);
4270 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4271 Value *PLowerBound =
4272 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4273 Value *PUpperBound =
4274 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4275 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4276
4277 // Set up the source location value for the OpenMP runtime.
4280
4281 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4282 Value *CastedChunkSize =
4283 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4284 Value *CastedTripCount =
4285 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4286
4287 Constant *SchedulingType = ConstantInt::get(
4288 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4289 Builder.CreateStore(Zero, PLowerBound);
4290 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4291 Builder.CreateStore(OrigUpperBound, PUpperBound);
4292 Builder.CreateStore(One, PStride);
4293
4294 // Call the "init" function and update the trip count of the loop with the
4295 // value it produced.
4296 uint32_t SrcLocStrSize;
4297 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4298 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4299 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4300 Builder.CreateCall(StaticInit,
4301 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4302 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4303 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4304 /*pstride=*/PStride, /*incr=*/One,
4305 /*chunk=*/CastedChunkSize});
4306
4307 // Load values written by the "init" function.
4308 Value *FirstChunkStart =
4309 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4310 Value *FirstChunkStop =
4311 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4312 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4313 Value *ChunkRange =
4314 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4315 Value *NextChunkStride =
4316 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4317
4318 // Create outer "dispatch" loop for enumerating the chunks.
4319 BasicBlock *DispatchEnter = splitBB(Builder, true);
4320 Value *DispatchCounter;
4321
4322 // It is safe to assume this didn't return an error because the callback
4323 // passed into createCanonicalLoop is the only possible error source, and it
4324 // always returns success.
4326 {Builder.saveIP(), DL},
4327 [&](InsertPointTy BodyIP, Value *Counter) {
4328 DispatchCounter = Counter;
4329 return Error::success();
4330 },
4331 FirstChunkStart, CastedTripCount, NextChunkStride,
4332 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4333 "dispatch"));
4334
4335 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4336 // not have to preserve the canonical invariant.
4337 BasicBlock *DispatchBody = DispatchCLI->getBody();
4338 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4339 BasicBlock *DispatchExit = DispatchCLI->getExit();
4340 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4341 DispatchCLI->invalidate();
4342
4343 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4344 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4345 redirectTo(CLI->getExit(), DispatchLatch, DL);
4346 redirectTo(DispatchBody, DispatchEnter, DL);
4347
4348 // Prepare the prolog of the chunk loop.
4351
4352 // Compute the number of iterations of the chunk loop.
4354 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4355 Value *IsLastChunk =
4356 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4357 Value *CountUntilOrigTripCount =
4358 Builder.CreateSub(CastedTripCount, DispatchCounter);
4359 Value *ChunkTripCount = Builder.CreateSelect(
4360 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4361 Value *BackcastedChunkTC =
4362 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4363 CLI->setTripCount(BackcastedChunkTC);
4364
4365 // Update all uses of the induction variable except the one in the condition
4366 // block that compares it with the actual upper bound, and the increment in
4367 // the latch block.
4368 Value *BackcastedDispatchCounter =
4369 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4370 CLI->mapIndVar([&](Instruction *) -> Value * {
4371 Builder.restoreIP(CLI->getBodyIP());
4372 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4373 });
4374
4375 // In the "exit" block, call the "fini" function.
4376 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4377 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4378
4379 // Add the barrier if requested.
4380 if (NeedsBarrier) {
4381 InsertPointOrErrorTy AfterIP =
4382 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4383 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4384 if (!AfterIP)
4385 return AfterIP.takeError();
4386 }
4387
4388#ifndef NDEBUG
4389 // Even though we currently do not support applying additional methods to it,
4390 // the chunk loop should remain a canonical loop.
4391 CLI->assertOK();
4392#endif
4393
4394 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4395}
4396
4397// Returns an LLVM function to call for executing an OpenMP static worksharing
4398// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4399// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4400static FunctionCallee
4402 WorksharingLoopType LoopType) {
4403 unsigned Bitwidth = Ty->getIntegerBitWidth();
4404 Module &M = OMPBuilder->M;
4405 switch (LoopType) {
4406 case WorksharingLoopType::ForStaticLoop:
4407 if (Bitwidth == 32)
4408 return OMPBuilder->getOrCreateRuntimeFunction(
4409 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4410 if (Bitwidth == 64)
4411 return OMPBuilder->getOrCreateRuntimeFunction(
4412 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4413 break;
4414 case WorksharingLoopType::DistributeStaticLoop:
4415 if (Bitwidth == 32)
4416 return OMPBuilder->getOrCreateRuntimeFunction(
4417 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4418 if (Bitwidth == 64)
4419 return OMPBuilder->getOrCreateRuntimeFunction(
4420 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4421 break;
4422 case WorksharingLoopType::DistributeForStaticLoop:
4423 if (Bitwidth == 32)
4424 return OMPBuilder->getOrCreateRuntimeFunction(
4425 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4426 if (Bitwidth == 64)
4427 return OMPBuilder->getOrCreateRuntimeFunction(
4428 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4429 break;
4430 }
4431 if (Bitwidth != 32 && Bitwidth != 64) {
4432 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4433 }
4434 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4435}
4436
4437// Inserts a call to proper OpenMP Device RTL function which handles
4438// loop worksharing.
4440 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4441 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4442 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4443 Type *TripCountTy = TripCount->getType();
4444 Module &M = OMPBuilder->M;
4445 IRBuilder<> &Builder = OMPBuilder->Builder;
4446 FunctionCallee RTLFn =
4447 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4448 SmallVector<Value *, 8> RealArgs;
4449 RealArgs.push_back(Ident);
4450 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4451 RealArgs.push_back(LoopBodyArg);
4452 RealArgs.push_back(TripCount);
4453 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4454 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4455 Builder.CreateCall(RTLFn, RealArgs);
4456 return;
4457 }
4458 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4459 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4460 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4461 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4462
4463 RealArgs.push_back(
4464 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4465 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4466 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4467 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4468 }
4469
4470 Builder.CreateCall(RTLFn, RealArgs);
4471}
4472
4473static void
4475 CanonicalLoopInfo *CLI, Value *Ident,
4476 Function &OutlinedFn, Type *ParallelTaskPtr,
4477 const SmallVector<Instruction *, 4> &ToBeDeleted,
4478 WorksharingLoopType LoopType) {
4479 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4480 BasicBlock *Preheader = CLI->getPreheader();
4481 Value *TripCount = CLI->getTripCount();
4482
4483 // After loop body outling, the loop body contains only set up
4484 // of loop body argument structure and the call to the outlined
4485 // loop body function. Firstly, we need to move setup of loop body args
4486 // into loop preheader.
4487 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4488 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4489
4490 // The next step is to remove the whole loop. We do not it need anymore.
4491 // That's why make an unconditional branch from loop preheader to loop
4492 // exit block
4493 Builder.restoreIP({Preheader, Preheader->end()});
4494 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4495 Preheader->getTerminator()->eraseFromParent();
4496 Builder.CreateBr(CLI->getExit());
4497
4498 // Delete dead loop blocks
4499 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4500 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4501 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4502 CleanUpInfo.EntryBB = CLI->getHeader();
4503 CleanUpInfo.ExitBB = CLI->getExit();
4504 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4505 DeleteDeadBlocks(BlocksToBeRemoved);
4506
4507 // Find the instruction which corresponds to loop body argument structure
4508 // and remove the call to loop body function instruction.
4509 Value *LoopBodyArg;
4510 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4511 assert(OutlinedFnUser &&
4512 "Expected unique undroppable user of outlined function");
4513 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4514 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4515 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4516 "Expected outlined function call to be located in loop preheader");
4517 // Check in case no argument structure has been passed.
4518 if (OutlinedFnCallInstruction->arg_size() > 1)
4519 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4520 else
4521 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4522 OutlinedFnCallInstruction->eraseFromParent();
4523
4524 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4525 LoopBodyArg, ParallelTaskPtr, TripCount,
4526 OutlinedFn);
4527
4528 for (auto &ToBeDeletedItem : ToBeDeleted)
4529 ToBeDeletedItem->eraseFromParent();
4530 CLI->invalidate();
4531}
4532
4534OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4535 InsertPointTy AllocaIP,
4536 WorksharingLoopType LoopType) {
4537 uint32_t SrcLocStrSize;
4538 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4539 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4540
4541 OutlineInfo OI;
4542 OI.OuterAllocaBB = CLI->getPreheader();
4543 Function *OuterFn = CLI->getPreheader()->getParent();
4544
4545 // Instructions which need to be deleted at the end of code generation
4547
4548 OI.OuterAllocaBB = AllocaIP.getBlock();
4549
4550 // Mark the body loop as region which needs to be extracted
4551 OI.EntryBB = CLI->getBody();
4552 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4553 "omp.prelatch", true);
4554
4555 // Prepare loop body for extraction
4556 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4557
4558 // Insert new loop counter variable which will be used only in loop
4559 // body.
4560 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4561 Instruction *NewLoopCntLoad =
4562 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4563 // New loop counter instructions are redundant in the loop preheader when
4564 // code generation for workshare loop is finshed. That's why mark them as
4565 // ready for deletion.
4566 ToBeDeleted.push_back(NewLoopCntLoad);
4567 ToBeDeleted.push_back(NewLoopCnt);
4568
4569 // Analyse loop body region. Find all input variables which are used inside
4570 // loop body region.
4571 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4573 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4574 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4575 ParallelRegionBlockSet.end());
4576
4577 CodeExtractorAnalysisCache CEAC(*OuterFn);
4578 CodeExtractor Extractor(Blocks,
4579 /* DominatorTree */ nullptr,
4580 /* AggregateArgs */ true,
4581 /* BlockFrequencyInfo */ nullptr,
4582 /* BranchProbabilityInfo */ nullptr,
4583 /* AssumptionCache */ nullptr,
4584 /* AllowVarArgs */ true,
4585 /* AllowAlloca */ true,
4586 /* AllocationBlock */ CLI->getPreheader(),
4587 /* Suffix */ ".omp_wsloop",
4588 /* AggrArgsIn0AddrSpace */ true);
4589
4590 BasicBlock *CommonExit = nullptr;
4591 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4592
4593 // Find allocas outside the loop body region which are used inside loop
4594 // body
4595 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4596
4597 // We need to model loop body region as the function f(cnt, loop_arg).
4598 // That's why we replace loop induction variable by the new counter
4599 // which will be one of loop body function argument
4601 CLI->getIndVar()->user_end());
4602 for (auto Use : Users) {
4603 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4604 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4605 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4606 }
4607 }
4608 }
4609 // Make sure that loop counter variable is not merged into loop body
4610 // function argument structure and it is passed as separate variable
4611 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4612
4613 // PostOutline CB is invoked when loop body function is outlined and
4614 // loop body is replaced by call to outlined function. We need to add
4615 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
4616 // function will handle loop control logic.
4617 //
4618 OI.PostOutlineCB = [=, ToBeDeletedVec =
4619 std::move(ToBeDeleted)](Function &OutlinedFn) {
4620 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4621 ToBeDeletedVec, LoopType);
4622 };
4623 addOutlineInfo(std::move(OI));
4624 return CLI->getAfterIP();
4625}
4626
4629 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4630 bool HasSimdModifier, bool HasMonotonicModifier,
4631 bool HasNonmonotonicModifier, bool HasOrderedClause,
4632 WorksharingLoopType LoopType) {
4633 if (Config.isTargetDevice())
4634 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4635 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4636 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4637 HasNonmonotonicModifier, HasOrderedClause);
4638
4639 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4640 OMPScheduleType::ModifierOrdered;
4641 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4642 case OMPScheduleType::BaseStatic:
4643 assert(!ChunkSize && "No chunk size with static-chunked schedule");
4644 if (IsOrdered)
4645 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4646 NeedsBarrier, ChunkSize);
4647 // FIXME: Monotonicity ignored?
4648 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4649
4650 case OMPScheduleType::BaseStaticChunked:
4651 if (IsOrdered)
4652 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4653 NeedsBarrier, ChunkSize);
4654 // FIXME: Monotonicity ignored?
4655 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4656 ChunkSize);
4657
4658 case OMPScheduleType::BaseRuntime:
4659 case OMPScheduleType::BaseAuto:
4660 case OMPScheduleType::BaseGreedy:
4661 case OMPScheduleType::BaseBalanced:
4662 case OMPScheduleType::BaseSteal:
4663 case OMPScheduleType::BaseGuidedSimd:
4664 case OMPScheduleType::BaseRuntimeSimd:
4665 assert(!ChunkSize &&
4666 "schedule type does not support user-defined chunk sizes");
4667 [[fallthrough]];
4668 case OMPScheduleType::BaseDynamicChunked:
4669 case OMPScheduleType::BaseGuidedChunked:
4670 case OMPScheduleType::BaseGuidedIterativeChunked:
4671 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4672 case OMPScheduleType::BaseStaticBalancedChunked:
4673 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4674 NeedsBarrier, ChunkSize);
4675
4676 default:
4677 llvm_unreachable("Unknown/unimplemented schedule kind");
4678 }
4679}
4680
4681/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4682/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4683/// the runtime. Always interpret integers as unsigned similarly to
4684/// CanonicalLoopInfo.
4685static FunctionCallee
4687 unsigned Bitwidth = Ty->getIntegerBitWidth();
4688 if (Bitwidth == 32)
4689 return OMPBuilder.getOrCreateRuntimeFunction(
4690 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4691 if (Bitwidth == 64)
4692 return OMPBuilder.getOrCreateRuntimeFunction(
4693 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4694 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4695}
4696
4697/// Returns an LLVM function to call for updating the next loop using OpenMP
4698/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4699/// the runtime. Always interpret integers as unsigned similarly to
4700/// CanonicalLoopInfo.
4701static FunctionCallee
4703 unsigned Bitwidth = Ty->getIntegerBitWidth();
4704 if (Bitwidth == 32)
4705 return OMPBuilder.getOrCreateRuntimeFunction(
4706 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4707 if (Bitwidth == 64)
4708 return OMPBuilder.getOrCreateRuntimeFunction(
4709 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4710 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4711}
4712
4713/// Returns an LLVM function to call for finalizing the dynamic loop using
4714/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4715/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4716static FunctionCallee
4718 unsigned Bitwidth = Ty->getIntegerBitWidth();
4719 if (Bitwidth == 32)
4720 return OMPBuilder.getOrCreateRuntimeFunction(
4721 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4722 if (Bitwidth == 64)
4723 return OMPBuilder.getOrCreateRuntimeFunction(
4724 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4725 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4726}
4727
4729OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4730 InsertPointTy AllocaIP,
4731 OMPScheduleType SchedType,
4732 bool NeedsBarrier, Value *Chunk) {
4733 assert(CLI->isValid() && "Requires a valid canonical loop");
4734 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4735 "Require dedicated allocate IP");
4737 "Require valid schedule type");
4738
4739 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4740 OMPScheduleType::ModifierOrdered;
4741
4742 // Set up the source location value for OpenMP runtime.
4744
4745 uint32_t SrcLocStrSize;
4746 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4747 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4748
4749 // Declare useful OpenMP runtime functions.
4750 Value *IV = CLI->getIndVar();
4751 Type *IVTy = IV->getType();
4752 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4753 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4754
4755 // Allocate space for computed loop bounds as expected by the "init" function.
4756 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4757 Type *I32Type = Type::getInt32Ty(M.getContext());
4758 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4759 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4760 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4761 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4762
4763 // At the end of the preheader, prepare for calling the "init" function by
4764 // storing the current loop bounds into the allocated space. A canonical loop
4765 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4766 // and produces an inclusive upper bound.
4767 BasicBlock *PreHeader = CLI->getPreheader();
4768 Builder.SetInsertPoint(PreHeader->getTerminator());
4769 Constant *One = ConstantInt::get(IVTy, 1);
4770 Builder.CreateStore(One, PLowerBound);
4771 Value *UpperBound = CLI->getTripCount();
4772 Builder.CreateStore(UpperBound, PUpperBound);
4773 Builder.CreateStore(One, PStride);
4774
4775 BasicBlock *Header = CLI->getHeader();
4776 BasicBlock *Exit = CLI->getExit();
4777 BasicBlock *Cond = CLI->getCond();
4778 BasicBlock *Latch = CLI->getLatch();
4779 InsertPointTy AfterIP = CLI->getAfterIP();
4780
4781 // The CLI will be "broken" in the code below, as the loop is no longer
4782 // a valid canonical loop.
4783
4784 if (!Chunk)
4785 Chunk = One;
4786
4787 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4788
4789 Constant *SchedulingType =
4790 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4791
4792 // Call the "init" function.
4793 Builder.CreateCall(DynamicInit,
4794 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4795 UpperBound, /* step */ One, Chunk});
4796
4797 // An outer loop around the existing one.
4798 BasicBlock *OuterCond = BasicBlock::Create(
4799 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4800 PreHeader->getParent());
4801 // This needs to be 32-bit always, so can't use the IVTy Zero above.
4802 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4803 Value *Res =
4804 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4805 PLowerBound, PUpperBound, PStride});
4806 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4807 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4808 Value *LowerBound =
4809 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4810 Builder.CreateCondBr(MoreWork, Header, Exit);
4811
4812 // Change PHI-node in loop header to use outer cond rather than preheader,
4813 // and set IV to the LowerBound.
4814 Instruction *Phi = &Header->front();
4815 auto *PI = cast<PHINode>(Phi);
4816 PI->setIncomingBlock(0, OuterCond);
4817 PI->setIncomingValue(0, LowerBound);
4818
4819 // Then set the pre-header to jump to the OuterCond
4820 Instruction *Term = PreHeader->getTerminator();
4821 auto *Br = cast<BranchInst>(Term);
4822 Br->setSuccessor(0, OuterCond);
4823
4824 // Modify the inner condition:
4825 // * Use the UpperBound returned from the DynamicNext call.
4826 // * jump to the loop outer loop when done with one of the inner loops.
4827 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4828 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4830 auto *CI = cast<CmpInst>(Comp);
4831 CI->setOperand(1, UpperBound);
4832 // Redirect the inner exit to branch to outer condition.
4833 Instruction *Branch = &Cond->back();
4834 auto *BI = cast<BranchInst>(Branch);
4835 assert(BI->getSuccessor(1) == Exit);
4836 BI->setSuccessor(1, OuterCond);
4837
4838 // Call the "fini" function if "ordered" is present in wsloop directive.
4839 if (Ordered) {
4840 Builder.SetInsertPoint(&Latch->back());
4841 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4842 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4843 }
4844
4845 // Add the barrier if requested.
4846 if (NeedsBarrier) {
4847 Builder.SetInsertPoint(&Exit->back());
4848 InsertPointOrErrorTy BarrierIP =
4849 createBarrier(LocationDescription(Builder.saveIP(), DL),
4850 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4851 /* CheckCancelFlag */ false);
4852 if (!BarrierIP)
4853 return BarrierIP.takeError();
4854 }
4855
4856 CLI->invalidate();
4857 return AfterIP;
4858}
4859
4860/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4861/// after this \p OldTarget will be orphaned.
4863 BasicBlock *NewTarget, DebugLoc DL) {
4864 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4865 redirectTo(Pred, NewTarget, DL);
4866}
4867
4868/// Determine which blocks in \p BBs are reachable from outside and remove the
4869/// ones that are not reachable from the function.
4871 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4872 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4873 for (Use &U : BB->uses()) {
4874 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4875 if (!UseInst)
4876 continue;
4877 if (BBsToErase.count(UseInst->getParent()))
4878 continue;
4879 return true;
4880 }
4881 return false;
4882 };
4883
4884 while (BBsToErase.remove_if(HasRemainingUses)) {
4885 // Try again if anything was removed.
4886 }
4887
4888 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4889 DeleteDeadBlocks(BBVec);
4890}
4891
4894 InsertPointTy ComputeIP) {
4895 assert(Loops.size() >= 1 && "At least one loop required");
4896 size_t NumLoops = Loops.size();
4897
4898 // Nothing to do if there is already just one loop.
4899 if (NumLoops == 1)
4900 return Loops.front();
4901
4902 CanonicalLoopInfo *Outermost = Loops.front();
4903 CanonicalLoopInfo *Innermost = Loops.back();
4904 BasicBlock *OrigPreheader = Outermost->getPreheader();
4905 BasicBlock *OrigAfter = Outermost->getAfter();
4906 Function *F = OrigPreheader->getParent();
4907
4908 // Loop control blocks that may become orphaned later.
4909 SmallVector<BasicBlock *, 12> OldControlBBs;
4910 OldControlBBs.reserve(6 * Loops.size());
4912 Loop->collectControlBlocks(OldControlBBs);
4913
4914 // Setup the IRBuilder for inserting the trip count computation.
4916 if (ComputeIP.isSet())
4917 Builder.restoreIP(ComputeIP);
4918 else
4919 Builder.restoreIP(Outermost->getPreheaderIP());
4920
4921 // Derive the collapsed' loop trip count.
4922 // TODO: Find common/largest indvar type.
4923 Value *CollapsedTripCount = nullptr;
4924 for (CanonicalLoopInfo *L : Loops) {
4925 assert(L->isValid() &&
4926 "All loops to collapse must be valid canonical loops");
4927 Value *OrigTripCount = L->getTripCount();
4928 if (!CollapsedTripCount) {
4929 CollapsedTripCount = OrigTripCount;
4930 continue;
4931 }
4932
4933 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4934 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4935 {}, /*HasNUW=*/true);
4936 }
4937
4938 // Create the collapsed loop control flow.
4939 CanonicalLoopInfo *Result =
4940 createLoopSkeleton(DL, CollapsedTripCount, F,
4941 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4942
4943 // Build the collapsed loop body code.
4944 // Start with deriving the input loop induction variables from the collapsed
4945 // one, using a divmod scheme. To preserve the original loops' order, the
4946 // innermost loop use the least significant bits.
4947 Builder.restoreIP(Result->getBodyIP());
4948
4949 Value *Leftover = Result->getIndVar();
4950 SmallVector<Value *> NewIndVars;
4951 NewIndVars.resize(NumLoops);
4952 for (int i = NumLoops - 1; i >= 1; --i) {
4953 Value *OrigTripCount = Loops[i]->getTripCount();
4954
4955 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4956 NewIndVars[i] = NewIndVar;
4957
4958 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4959 }
4960 // Outermost loop gets all the remaining bits.
4961 NewIndVars[0] = Leftover;
4962
4963 // Construct the loop body control flow.
4964 // We progressively construct the branch structure following in direction of
4965 // the control flow, from the leading in-between code, the loop nest body, the
4966 // trailing in-between code, and rejoining the collapsed loop's latch.
4967 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
4968 // the ContinueBlock is set, continue with that block. If ContinuePred, use
4969 // its predecessors as sources.
4970 BasicBlock *ContinueBlock = Result->getBody();
4971 BasicBlock *ContinuePred = nullptr;
4972 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4973 BasicBlock *NextSrc) {
4974 if (ContinueBlock)
4975 redirectTo(ContinueBlock, Dest, DL);
4976 else
4977 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4978
4979 ContinueBlock = nullptr;
4980 ContinuePred = NextSrc;
4981 };
4982
4983 // The code before the nested loop of each level.
4984 // Because we are sinking it into the nest, it will be executed more often
4985 // that the original loop. More sophisticated schemes could keep track of what
4986 // the in-between code is and instantiate it only once per thread.
4987 for (size_t i = 0; i < NumLoops - 1; ++i)
4988 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4989
4990 // Connect the loop nest body.
4991 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4992
4993 // The code after the nested loop at each level.
4994 for (size_t i = NumLoops - 1; i > 0; --i)
4995 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4996
4997 // Connect the finished loop to the collapsed loop latch.
4998 ContinueWith(Result->getLatch(), nullptr);
4999
5000 // Replace the input loops with the new collapsed loop.
5001 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5002 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5003
5004 // Replace the input loop indvars with the derived ones.
5005 for (size_t i = 0; i < NumLoops; ++i)
5006 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5007
5008 // Remove unused parts of the input loops.
5009 removeUnusedBlocksFromParent(OldControlBBs);
5010
5011 for (CanonicalLoopInfo *L : Loops)
5012 L->invalidate();
5013
5014#ifndef NDEBUG
5015 Result->assertOK();
5016#endif
5017 return Result;
5018}
5019
5020std::vector<CanonicalLoopInfo *>
5022 ArrayRef<Value *> TileSizes) {
5023 assert(TileSizes.size() == Loops.size() &&
5024 "Must pass as many tile sizes as there are loops");
5025 int NumLoops = Loops.size();
5026 assert(NumLoops >= 1 && "At least one loop to tile required");
5027
5028 CanonicalLoopInfo *OutermostLoop = Loops.front();
5029 CanonicalLoopInfo *InnermostLoop = Loops.back();
5030 Function *F = OutermostLoop->getBody()->getParent();
5031 BasicBlock *InnerEnter = InnermostLoop->getBody();
5032 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5033
5034 // Loop control blocks that may become orphaned later.
5035 SmallVector<BasicBlock *, 12> OldControlBBs;
5036 OldControlBBs.reserve(6 * Loops.size());
5038 Loop->collectControlBlocks(OldControlBBs);
5039
5040 // Collect original trip counts and induction variable to be accessible by
5041 // index. Also, the structure of the original loops is not preserved during
5042 // the construction of the tiled loops, so do it before we scavenge the BBs of
5043 // any original CanonicalLoopInfo.
5044 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5045 for (CanonicalLoopInfo *L : Loops) {
5046 assert(L->isValid() && "All input loops must be valid canonical loops");
5047 OrigTripCounts.push_back(L->getTripCount());
5048 OrigIndVars.push_back(L->getIndVar());
5049 }
5050
5051 // Collect the code between loop headers. These may contain SSA definitions
5052 // that are used in the loop nest body. To be usable with in the innermost
5053 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5054 // these instructions may be executed more often than before the tiling.
5055 // TODO: It would be sufficient to only sink them into body of the
5056 // corresponding tile loop.
5058 for (int i = 0; i < NumLoops - 1; ++i) {
5059 CanonicalLoopInfo *Surrounding = Loops[i];
5060 CanonicalLoopInfo *Nested = Loops[i + 1];
5061
5062 BasicBlock *EnterBB = Surrounding->getBody();
5063 BasicBlock *ExitBB = Nested->getHeader();
5064 InbetweenCode.emplace_back(EnterBB, ExitBB);
5065 }
5066
5067 // Compute the trip counts of the floor loops.
5069 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5070 SmallVector<Value *, 4> FloorCount, FloorRems;
5071 for (int i = 0; i < NumLoops; ++i) {
5072 Value *TileSize = TileSizes[i];
5073 Value *OrigTripCount = OrigTripCounts[i];
5074 Type *IVType = OrigTripCount->getType();
5075
5076 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5077 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5078
5079 // 0 if tripcount divides the tilesize, 1 otherwise.
5080 // 1 means we need an additional iteration for a partial tile.
5081 //
5082 // Unfortunately we cannot just use the roundup-formula
5083 // (tripcount + tilesize - 1)/tilesize
5084 // because the summation might overflow. We do not want introduce undefined
5085 // behavior when the untiled loop nest did not.
5086 Value *FloorTripOverflow =
5087 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5088
5089 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5090 FloorTripCount =
5091 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5092 "omp_floor" + Twine(i) + ".tripcount", true);
5093
5094 // Remember some values for later use.
5095 FloorCount.push_back(FloorTripCount);
5096 FloorRems.push_back(FloorTripRem);
5097 }
5098
5099 // Generate the new loop nest, from the outermost to the innermost.
5100 std::vector<CanonicalLoopInfo *> Result;
5101 Result.reserve(NumLoops * 2);
5102
5103 // The basic block of the surrounding loop that enters the nest generated
5104 // loop.
5105 BasicBlock *Enter = OutermostLoop->getPreheader();
5106
5107 // The basic block of the surrounding loop where the inner code should
5108 // continue.
5109 BasicBlock *Continue = OutermostLoop->getAfter();
5110
5111 // Where the next loop basic block should be inserted.
5112 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5113
5114 auto EmbeddNewLoop =
5115 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5116 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5117 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5118 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5119 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5120 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5121
5122 // Setup the position where the next embedded loop connects to this loop.
5123 Enter = EmbeddedLoop->getBody();
5124 Continue = EmbeddedLoop->getLatch();
5125 OutroInsertBefore = EmbeddedLoop->getLatch();
5126 return EmbeddedLoop;
5127 };
5128
5129 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5130 const Twine &NameBase) {
5131 for (auto P : enumerate(TripCounts)) {
5132 CanonicalLoopInfo *EmbeddedLoop =
5133 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5134 Result.push_back(EmbeddedLoop);
5135 }
5136 };
5137
5138 EmbeddNewLoops(FloorCount, "floor");
5139
5140 // Within the innermost floor loop, emit the code that computes the tile
5141 // sizes.
5143 SmallVector<Value *, 4> TileCounts;
5144 for (int i = 0; i < NumLoops; ++i) {
5145 CanonicalLoopInfo *FloorLoop = Result[i];
5146 Value *TileSize = TileSizes[i];
5147
5148 Value *FloorIsEpilogue =
5149 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5150 Value *TileTripCount =
5151 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5152
5153 TileCounts.push_back(TileTripCount);
5154 }
5155
5156 // Create the tile loops.
5157 EmbeddNewLoops(TileCounts, "tile");
5158
5159 // Insert the inbetween code into the body.
5160 BasicBlock *BodyEnter = Enter;
5161 BasicBlock *BodyEntered = nullptr;
5162 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5163 BasicBlock *EnterBB = P.first;
5164 BasicBlock *ExitBB = P.second;
5165
5166 if (BodyEnter)
5167 redirectTo(BodyEnter, EnterBB, DL);
5168 else
5169 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5170
5171 BodyEnter = nullptr;
5172 BodyEntered = ExitBB;
5173 }
5174
5175 // Append the original loop nest body into the generated loop nest body.
5176 if (BodyEnter)
5177 redirectTo(BodyEnter, InnerEnter, DL);
5178 else
5179 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5181
5182 // Replace the original induction variable with an induction variable computed
5183 // from the tile and floor induction variables.
5184 Builder.restoreIP(Result.back()->getBodyIP());
5185 for (int i = 0; i < NumLoops; ++i) {
5186 CanonicalLoopInfo *FloorLoop = Result[i];
5187 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5188 Value *OrigIndVar = OrigIndVars[i];
5189 Value *Size = TileSizes[i];
5190
5191 Value *Scale =
5192 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5193 Value *Shift =
5194 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5195 OrigIndVar->replaceAllUsesWith(Shift);
5196 }
5197
5198 // Remove unused parts of the original loops.
5199 removeUnusedBlocksFromParent(OldControlBBs);
5200
5201 for (CanonicalLoopInfo *L : Loops)
5202 L->invalidate();
5203
5204#ifndef NDEBUG
5205 for (CanonicalLoopInfo *GenL : Result)
5206 GenL->assertOK();
5207#endif
5208 return Result;
5209}
5210
/// Attach metadata \p Properties to the basic block described by \p BB. If the
/// basic block already has metadata, the basic block properties are appended.
static void addBasicBlockMetadata(BasicBlock *BB,
                                  ArrayRef<Metadata *> Properties) {
  // Nothing to do if no property to attach.
  if (Properties.empty())
    return;

  LLVMContext &Ctx = BB->getContext();
  SmallVector<Metadata *> NewProperties;
  // Slot 0 is a placeholder for the self-reference that marks this node as
  // loop metadata; it is filled in after the node is created below.
  NewProperties.push_back(nullptr);

  // If the basic block already has metadata, prepend it to the new metadata.
  MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
  if (Existing)
    append_range(NewProperties, drop_begin(Existing->operands(), 1));

  append_range(NewProperties, Properties);
  // The node must be distinct so the self-referential first operand uniquely
  // identifies this loop (per the llvm.loop metadata convention).
  MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
  BasicBlockID->replaceOperandWith(0, BasicBlockID);

  BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
}
5234
/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
/// loop already has metadata, the loop properties are appended.
static void addLoopMetadata(CanonicalLoopInfo *Loop,
                            ArrayRef<Metadata *> Properties) {
  assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");

  // Attach metadata to the loop's latch; by convention, llvm.loop metadata
  // lives on the latch terminator.
  BasicBlock *Latch = Loop->getLatch();
  assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
  addBasicBlockMetadata(Latch, Properties);
}
5246
5247/// Attach llvm.access.group metadata to the memref instructions of \p Block
5248static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5249 LoopInfo &LI) {
5250 for (Instruction &I : *Block) {
5251 if (I.mayReadOrWriteMemory()) {
5252 // TODO: This instruction may already have access group from
5253 // other pragmas e.g. #pragma clang loop vectorize. Append
5254 // so that the existing metadata is not overwritten.
5255 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5256 }
5257 }
5258}
5259
// Request complete unrolling of \p Loop: attach both "unroll.enable" and
// "unroll.full" loop metadata so the LoopUnrollPass removes the loop.
void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
             MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
}
5266
// Request unrolling of \p Loop with a factor chosen by the LoopUnrollPass's
// own heuristics: only "unroll.enable" is attached, no explicit count.
void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {
                MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
            });
}
5274
// Create two versions of \p CanonicalLoop guarded by \p IfCond: the original
// loop is taken when the condition is true, and a clone of the loop (recorded
// in \p VMap) is executed otherwise. The caller typically marks the clone as
// not-vectorizable (see applySimd).
void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                      Value *IfCond, ValueToValueMapTy &VMap,
                                      const Twine &NamePrefix) {
  Function *F = CanonicalLoop->getFunction();

  // Define where if branch should be inserted
  Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();

  // TODO: We should not rely on pass manager. Currently we use pass manager
  // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
  // object. We should have a method which returns all blocks between
  // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  // Get the loop which needs to be cloned
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());

  // Create additional blocks for the if statement
  BasicBlock *Head = SplitBefore->getParent();
  Instruction *HeadOldTerm = Head->getTerminator();
  llvm::LLVMContext &C = Head->getContext();
  llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
  llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());

  // Create if condition branch.
  Builder.SetInsertPoint(HeadOldTerm);
  Instruction *BrInstr =
      Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
  InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
  // Then block contains branch to omp loop which needs to be vectorized
  spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
  ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);

  Builder.SetInsertPoint(ElseBlock);

  // Clone loop for the else branch
  SmallVector<BasicBlock *, 8> NewBlocks;

  // Map the original preheader to the else entry so cloned branches into the
  // loop resolve to the new version.
  VMap[CanonicalLoop->getPreheader()] = ElseBlock;
  for (BasicBlock *Block : L->getBlocks()) {
    BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
    NewBB->moveBefore(CanonicalLoop->getExit());
    VMap[Block] = NewBB;
    NewBlocks.push_back(NewBB);
  }
  // Rewrite cloned instructions to reference cloned values/blocks.
  remapInstructionsInBlocks(NewBlocks, VMap);
  Builder.CreateBr(NewBlocks.front());
}
5330
// Return the default SIMD alignment (in bits) for \p TargetTriple, refined by
// the function-level \p Features map. The values track the widest vector
// register width of the target; 0 means "no target-specific default known".
unsigned
OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
                                           const StringMap<bool> &Features) {
  if (TargetTriple.isX86()) {
    // AVX-512 implies 512-bit vectors; AVX implies 256-bit; otherwise SSE.
    if (Features.lookup("avx512f"))
      return 512;
    else if (Features.lookup("avx"))
      return 256;
    return 128;
  }
  if (TargetTriple.isPPC())
    return 128;
  if (TargetTriple.isWasm())
    return 128;
  return 0;
}
5347
// Annotate \p CanonicalLoop for SIMD execution ("omp simd"): emit alignment
// assumptions for \p AlignedVars, optionally version the loop on \p IfCond
// (the clone gets vectorization disabled), mark memory accesses parallel when
// safe, and attach vectorize.enable/width loop metadata from \p Simdlen /
// \p Safelen.
void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
                                MapVector<Value *, Value *> AlignedVars,
                                Value *IfCond, OrderKind Order,
                                ConstantInt *Simdlen, ConstantInt *Safelen) {
  LLVMContext &Ctx = Builder.getContext();

  Function *F = CanonicalLoop->getFunction();

  // TODO: We should not rely on pass manager. Currently we use pass manager
  // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
  // object. We should have a method which returns all blocks between
  // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);

  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
  if (AlignedVars.size()) {
    InsertPointTy IP = Builder.saveIP();
    for (auto &AlignedItem : AlignedVars) {
      Value *AlignedPtr = AlignedItem.first;
      Value *Alignment = AlignedItem.second;
      // NOTE(review): assumes every aligned pointer is produced by an
      // Instruction; the dyn_cast result is dereferenced without a null
      // check — confirm callers guarantee this.
      Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
      Builder.SetInsertPoint(loadInst->getNextNode());
      Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
                                        Alignment);
    }
    Builder.restoreIP(IP);
  }

  if (IfCond) {
    ValueToValueMapTy VMap;
    createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
    // Add metadata to the cloned loop which disables vectorization
    Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
    assert(MappedLatch &&
           "Cannot find value which corresponds to original loop latch");
    assert(isa<BasicBlock>(MappedLatch) &&
           "Cannot cast mapped latch block value to BasicBlock");
    BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
    ConstantAsMetadata *BoolConst =
        ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
    addBasicBlockMetadata(
        NewLatchBlock,
        {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
                           BoolConst})});
  }

  SmallSet<BasicBlock *, 8> Reachable;

  // Get the basic blocks from the loop in which memref instructions
  // can be found.
  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
  // preferably without running any passes.
  for (BasicBlock *Block : L->getBlocks()) {
    // Header and condition block hold loop control, not user memory accesses.
    if (Block == CanonicalLoop->getCond() ||
        Block == CanonicalLoop->getHeader())
      continue;
    Reachable.insert(Block);
  }

  SmallVector<Metadata *> LoopMDList;

  // In presence of finite 'safelen', it may be unsafe to mark all
  // the memory instructions parallel, because loop-carried
  // dependences of 'safelen' iterations are possible.
  // If clause order(concurrent) is specified then the memory instructions
  // are marked parallel even if 'safelen' is finite.
  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
    // Add access group metadata to memory-access instructions.
    MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
    for (BasicBlock *BB : Reachable)
      addSimdMetadata(BB, AccessGroup, LI);
    // TODO: If the loop has existing parallel access metadata, have
    // to combine two lists.
    LoopMDList.push_back(MDNode::get(
        Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
  }

  // Use the above access group metadata to create loop level
  // metadata, which should be distinct for each loop.
  ConstantAsMetadata *BoolConst =
      ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
  LoopMDList.push_back(MDNode::get(
      Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));

  if (Simdlen || Safelen) {
    // If both simdlen and safelen clauses are specified, the value of the
    // simdlen parameter must be less than or equal to the value of the safelen
    // parameter. Therefore, use safelen only in the absence of simdlen.
    ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
    LoopMDList.push_back(
        MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
                          ConstantAsMetadata::get(VectorizeWidth)}));
  }

  addLoopMetadata(CanonicalLoop, LoopMDList);
}
5450
/// Create the TargetMachine object to query the backend for optimization
/// preferences.
///
/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
/// needed for the LLVM pass pipline. We use some default options to avoid
/// having to pass too many settings from the frontend that probably do not
/// matter.
///
/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
/// method. If we are going to use TargetMachine for more purposes, especially
/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
/// might become be worth requiring front-ends to pass on their TargetMachine,
/// or at least cache it between methods. Note that while fontends such as Clang
/// have just a single main TargetMachine per translation unit, "target-cpu" and
/// "target-features" that determine the TargetMachine are per-function and can
/// be overrided using __attribute__((target("OPTIONS"))).
///
/// Returns nullptr-equivalent ({}) when the target is not registered.
static std::unique_ptr<TargetMachine>
createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
  Module *M = F->getParent();

  // CPU/features come from per-function attributes (may be overridden via
  // __attribute__((target(...)))), not from the module.
  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
  const std::string &Triple = M->getTargetTriple();

  std::string Error;
  const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
  if (!TheTarget)
    return {};

  llvm::TargetOptions Options;
  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
      Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
      /*CodeModel=*/std::nullopt, OptLevel));
}
5486
/// Heuristically determine the best-performant unroll factor for \p CLI. This
/// depends on the target processor. We are re-using the same heuristics as the
/// LoopUnrollPass. Returns 1 to signal that the loop should not be unrolled.
static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
  Function *F = CLI->getFunction();

  // Assume the user requests the most aggressive unrolling, even if the rest of
  // the code is optimized using a lower setting.
  CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
  std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);

  // Set up a throwaway analysis manager with exactly the analyses the unroll
  // heuristics consume.
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return TargetLibraryAnalysis(); });
  FAM.registerPass([]() { return AssumptionAnalysis(); });
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
  TargetIRAnalysis TIRA;
  if (TM)
    TIRA = TargetIRAnalysis(
        [&](const Function &F) { return TM->getTargetTransformInfo(F); });
  FAM.registerPass([&]() { return TIRA; });

  TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
  ScalarEvolutionAnalysis SEA;
  ScalarEvolution &&SE = SEA.run(*F, FAM);
  DominatorTreeAnalysis DTA;
  DominatorTree &&DT = DTA.run(*F, FAM);
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  AssumptionAnalysis ACT;
  AssumptionCache &&AC = ACT.run(*F, FAM);
  OptimizationRemarkEmitter ORE{F};

  Loop *L = LI.getLoopFor(CLI->getHeader());
  assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");

  // Query the same preference machinery the LoopUnrollPass uses.
  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, SE, TTI,
      /*BlockFrequencyInfo=*/nullptr,
      /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
      /*UserThreshold=*/std::nullopt,
      /*UserCount=*/std::nullopt,
      /*UserAllowPartial=*/true,
      /*UserAllowRuntime=*/true,
      /*UserUpperBound=*/std::nullopt,
      /*UserFullUnrollMaxCount=*/std::nullopt);

  UP.Force = true;

  // Account for additional optimizations taking place before the LoopUnrollPass
  // would unroll the loop.
  UP.Threshold *= UnrollThresholdFactor;
  UP.PartialThreshold *= UnrollThresholdFactor;

  // Use normal unroll factors even if the rest of the code is optimized for
  // size.
  UP.OptSizeThreshold = UP.Threshold;
  UP.PartialOptSizeThreshold = UP.PartialThreshold;

  LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
                    << "  Threshold=" << UP.Threshold << "\n"
                    << "  PartialThreshold=" << UP.PartialThreshold << "\n"
                    << "  OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
                    << "  PartialOptSizeThreshold="
                    << UP.PartialOptSizeThreshold << "\n");

  // Disable peeling.
  TargetTransformInfo::PeelingPreferences PP =
      gatherPeelingPreferences(L, SE, TTI,
                               /*UserAllowPeeling=*/false,
                               /*UserAllowProfileBasedPeeling=*/false,
                               /*UnrollingSpecficValues=*/false);

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  // Assume that reads and writes to stack variables can be eliminated by
  // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
  // size.
  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : *BB) {
      Value *Ptr;
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        Ptr = Load->getPointerOperand();
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        Ptr = Store->getPointerOperand();
      } else
        continue;

      Ptr = Ptr->stripPointerCasts();

      // Only entry-block allocas are reliably promotable.
      if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
        if (Alloca->getParent() == &F->getEntryBlock())
          EphValues.insert(&I);
      }
    }
  }

  UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);

  // Loop is not unrollable if the loop contains certain instructions.
  if (!UCE.canUnroll()) {
    LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
    return 1;
  }

  LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
                    << "\n");

  // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
  // be able to use it.
  int TripCount = 0;
  int MaxTripCount = 0;
  bool MaxOrZero = false;
  unsigned TripMultiple = 0;

  bool UseUpperBound = false;
  computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
                     MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
                     UseUpperBound);
  unsigned Factor = UP.Count;
  LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");

  // This function returns 1 to signal to not unroll a loop.
  if (Factor == 0)
    return 1;
  return Factor;
}
5617
// Partially unroll \p Loop by \p Factor. Factor == 0 requests a heuristic
// choice; Factor == 1 is a no-op. If \p UnrolledCLI is null, the unrolling is
// delegated to the LoopUnrollPass via metadata; otherwise the loop is tiled
// so a CanonicalLoopInfo for the unrolled loop can be returned for further
// loop-associated directives.
void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
                                        int32_t Factor,
                                        CanonicalLoopInfo **UnrolledCLI) {
  assert(Factor >= 0 && "Unroll factor must not be negative");

  Function *F = Loop->getFunction();
  LLVMContext &Ctx = F->getContext();

  // If the unrolled loop is not used for another loop-associated directive, it
  // is sufficient to add metadata for the LoopUnrollPass.
  if (!UnrolledCLI) {
    SmallVector<Metadata *, 2> LoopMetadata;
    LoopMetadata.push_back(
        MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));

    if (Factor >= 1) {
      ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
      LoopMetadata.push_back(MDNode::get(
          Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
    }

    addLoopMetadata(Loop, LoopMetadata);
    return;
  }

  // Heuristically determine the unroll factor.
  if (Factor == 0)
    Factor = computeHeuristicUnrollFactor(Loop);

  // No change required with unroll factor 1.
  if (Factor == 1) {
    *UnrolledCLI = Loop;
    return;
  }

  assert(Factor >= 2 &&
         "unrolling only makes sense with a factor of 2 or larger");

  Type *IndVarTy = Loop->getIndVarType();

  // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
  // unroll the inner loop.
  Value *FactorVal =
      ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
                                       /*isSigned=*/false));
  std::vector<CanonicalLoopInfo *> LoopNest =
      tileLoops(DL, {Loop}, {FactorVal});
  assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
  *UnrolledCLI = LoopNest[0];
  CanonicalLoopInfo *InnerLoop = LoopNest[1];

  // LoopUnrollPass can only fully unroll loops with constant trip count.
  // Unroll by the unroll factor with a fallback epilog for the remainder
  // iterations if necessary.
  ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
  addLoopMetadata(
      InnerLoop,
      {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
       MDNode::get(
           Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});

#ifndef NDEBUG
  (*UnrolledCLI)->assertOK();
#endif
}
5685
// Emit a __kmpc_copyprivate call that broadcasts the value at \p CpyBuf from
// the thread that executed the single region (indicated by \p DidIt) to all
// other threads, using \p CpyFn to perform the copy. The runtime call also
// implies a barrier.
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
                                   llvm::Value *BufSize, llvm::Value *CpyBuf,
                                   llvm::Value *CpyFn, llvm::Value *DidIt) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);

  // The runtime expects the flag by value, so load it here.
  llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);

  Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
  Builder.CreateCall(Fn, Args);

  return Builder.saveIP();
}
5707
// Emit an OpenMP 'single' region: the body (generated by \p BodyGenCB) runs on
// exactly one thread. Copyprivate variables in \p CPVars are broadcast to the
// other threads via __kmpc_copyprivate; otherwise an implicit barrier is
// emitted unless \p IsNowait is set.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
    ArrayRef<llvm::Function *> CPFuncs) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // If needed allocate and initialize `DidIt` with 0.
  // DidIt: flag variable: 1=single thread; 0=not single thread.
  llvm::Value *DidIt = nullptr;
  if (!CPVars.empty()) {
    DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
    Builder.CreateStore(Builder.getInt32(0), DidIt);
  }

  Directive OMPD = Directive::OMPD_single;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
    if (Error Err = FiniCB(IP))
      return Err;

    // The thread that executes the single region must set `DidIt` to 1.
    // This is used by __kmpc_copyprivate, to know if the caller is the
    // single thread or not.
    if (DidIt)
      Builder.CreateStore(Builder.getInt32(1), DidIt);

    return Error::success();
  };

  // generates the following:
  // if (__kmpc_single()) {
  //		.... single region ...
  // 		__kmpc_end_single
  // }
  // __kmpc_copyprivate
  // __kmpc_barrier

  InsertPointOrErrorTy AfterIP =
      EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
                           /*Conditional*/ true,
                           /*hasFinalize*/ true);
  if (!AfterIP)
    return AfterIP.takeError();

  if (DidIt) {
    for (size_t I = 0, E = CPVars.size(); I < E; ++I)
      // NOTE BufSize is currently unused, so just pass 0.
      createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
                        /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
                        CPFuncs[I], DidIt);
    // NOTE __kmpc_copyprivate already inserts a barrier
  } else if (!IsNowait) {
    InsertPointOrErrorTy AfterIP =
        createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                      omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                      /* CheckCancelFlag */ false);
    if (!AfterIP)
      return AfterIP.takeError();
  }
  return Builder.saveIP();
}
5782
// Emit an OpenMP 'critical' region named \p CriticalName, guarded by
// __kmpc_critical / __kmpc_end_critical (with an optional lock \p HintInst).
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_critical;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  // Named critical sections share a global lock variable per name.
  Value *LockVar = getOMPCriticalRegionLock(CriticalName);
  Value *Args[] = {Ident, ThreadId, LockVar};

  SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
  Function *RTFn = nullptr;
  if (HintInst) {
    // Add Hint to entry Args and create call
    EnterArgs.push_back(HintInst);
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
  } else {
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
  }
  Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);

  Function *ExitRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}
5816
// Emit an 'ordered depend(source/sink)' construct: store the \p StoreValues
// iteration vector into a stack-allocated i64 array and pass its address to
// __kmpc_doacross_post (source) or __kmpc_doacross_wait (sink).
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
                                     InsertPointTy AllocaIP, unsigned NumLoops,
                                     ArrayRef<llvm::Value *> StoreValues,
                                     const Twine &Name, bool IsDependSource) {
  assert(
      llvm::all_of(StoreValues,
                   [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
      "OpenMP runtime requires depend vec with i64 type");

  if (!updateToLocation(Loc))
    return Loc.IP;

  // Allocate space for vector and generate alloc instruction.
  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
  ArgsBase->setAlignment(Align(8));
  Builder.restoreIP(Loc.IP);

  // Store the index value with offset in depend vector.
  for (unsigned I = 0; I < NumLoops; ++I) {
    Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
        ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
    StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
    STInst->setAlignment(Align(8));
  }

  // The runtime takes a pointer to the first element of the vector.
  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
      ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};

  Function *RTLFn = nullptr;
  if (IsDependSource)
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
  else
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
  Builder.CreateCall(RTLFn, Args);

  return Builder.saveIP();
}
5863
// Emit an 'ordered' region. With \p IsThreads the body is wrapped in
// __kmpc_ordered / __kmpc_end_ordered; for 'ordered simd' no runtime calls
// are needed and the body is emitted inline.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsThreads) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_ordered;
  Instruction *EntryCall = nullptr;
  Instruction *ExitCall = nullptr;

  if (IsThreads) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    Value *ThreadId = getOrCreateThreadID(Ident);
    Value *Args[] = {Ident, ThreadId};

    Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
    EntryCall = Builder.CreateCall(EntryRTLFn, Args);

    Function *ExitRTLFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
    ExitCall = Builder.CreateCall(ExitRTLFn, Args);
  }

  // EmitOMPInlinedRegion tolerates null entry/exit calls (the 'simd' case).
  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}
5892
// Emit a generic inlined OpenMP region: place \p EntryCall (optionally as a
// condition guarding the body when \p Conditional), generate the body via
// \p BodyGenCB, then emit finalization and \p ExitCall before rejoining the
// surrounding control flow. Returns the insertion point after the region.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
    Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
    bool HasFinalize, bool IsCancellable) {

  if (HasFinalize)
    FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});

  // Create inlined region's entry and body blocks, in preparation
  // for conditional creation
  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Instruction *SplitPos = EntryBB->getTerminator();
  // Use a temporary unreachable as a split position if the block has no
  // branch terminator yet; it is cleaned up at the end.
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
  BasicBlock *FiniBB =
      EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");

  Builder.SetInsertPoint(EntryBB->getTerminator());
  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);

  // generate body
  if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
                            /* CodeGenIP */ Builder.saveIP()))
    return Err;

  // emit exit call and do any needed finalization.
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  InsertPointOrErrorTy AfterIP =
      emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  if (!AfterIP)
    return AfterIP.takeError();
  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
         "Unexpected Control Flow State!");
  MergeBlockIntoPredecessor(FiniBB);

  // If we are skipping the region of a non conditional, remove the exit
  // block, and clear the builder's insertion point.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  auto merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  auto InsertBB = merged ? ExitPredBB : ExitBB;
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}
5945
// If \p Conditional, turn the directive entry into an if-statement: the body
// runs only when \p EntryCall returns non-zero, otherwise control jumps to
// \p ExitBB. Leaves the builder positioned inside the new body block and
// returns an insertion point at \p ExitBB.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // if nothing to do, Return current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  // Temporary terminator so the entry branch can be re-inserted before it.
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit thenBB and set the Builder's insertion point there for
  // body generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);

  // Move Entry branch to end of ThenBB, and replace with conditional
  // branch (If-stmt)
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}
5975
5976OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5977 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5978 bool HasFinalize) {
5979
5980 Builder.restoreIP(FinIP);
5981
5982 // If there is finalization to do, emit it before the exit call
5983 if (HasFinalize) {
5984 assert(!FinalizationStack.empty() &&
5985 "Unexpected finalization stack state!");
5986
5987 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5988 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5989
5990 if (Error Err = Fi.FiniCB(FinIP))
5991 return Err;
5992
5993 BasicBlock *FiniBB = FinIP.getBlock();
5994 Instruction *FiniBBTI = FiniBB->getTerminator();
5995
5996 // set Builder IP for call creation
5997 Builder.SetInsertPoint(FiniBBTI);
5998 }
5999
6000 if (!ExitCall)
6001 return Builder.saveIP();
6002
6003 // place the Exitcall as last instruction before Finalization block terminator
6004 ExitCall->removeFromParent();
6005 Builder.Insert(ExitCall);
6006
6007 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6008 ExitCall->getIterator());
6009}
6010
// Build the control flow for an OpenMP `copyin` clause: compare the master
// thread's address with this thread's private address and, when they differ,
// run the copy block ("copyin.not.master") before continuing. Returns the
// builder's saved insertion point (inside the copy block).
// NOTE(review): extraction artifact — the declaration line (orig. 6011,
// presumably OpenMPIRBuilder::createCopyinClauseBlocks) and lines 6017/6056
// were dropped from this listing; verify against upstream before editing.
6012 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6013 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
// An unset insertion point means there is nothing to emit.
6014 if (!IP.isSet())
6015 return IP;
6016
6018
6019 // creates the following CFG structure
6020 // OMP_Entry : (MasterAddr != PrivateAddr)?
6021 // F T
6022 // | \
6023 // | copin.not.master
6024 // | /
6025 // v /
6026 // copyin.not.master.end
6027 // |
6028 // v
6029 // OMP.Entry.Next
6030
6031 BasicBlock *OMP_Entry = IP.getBlock();
6032 Function *CurFn = OMP_Entry->getParent();
6033 BasicBlock *CopyBegin =
6034 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6035 BasicBlock *CopyEnd = nullptr;
6036
6037 // If entry block is terminated, split to preserve the branch to following
6038 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6039 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6040 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6041 "copyin.not.master.end");
6042 OMP_Entry->getTerminator()->eraseFromParent();
6043 } else {
6044 CopyEnd =
6045 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6046 }
6047
// Compare the two addresses as integers; only copy when they differ (i.e.
// this thread is not the master).
6048 Builder.SetInsertPoint(OMP_Entry);
6049 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6050 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6051 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6052 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6053
6054 Builder.SetInsertPoint(CopyBegin);
// NOTE(review): the branch emitted under this condition (orig. line 6056,
// presumably a branch to CopyEnd) is missing from this extraction.
6055 if (BranchtoEnd)
6057
6058 return Builder.saveIP();
6059}
6060
// Emit a call to the OpenMP runtime allocator entry point __kmpc_alloc,
// passing the current thread id, the allocation size, and the allocator
// handle. Returns the resulting CallInst.
// NOTE(review): extraction artifact — the declaration line (orig. 6061,
// presumably OpenMPIRBuilder::createOMPAlloc) and line 6064 were dropped.
6062 Value *Size, Value *Allocator,
6063 std::string Name) {
6065 updateToLocation(Loc);
6066
// Build the ident_t source-location argument and derive the thread id.
6067 uint32_t SrcLocStrSize;
6068 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6069 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6070 Value *ThreadId = getOrCreateThreadID(Ident);
6071 Value *Args[] = {ThreadId, Size, Allocator};
6072
6073 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6074
6075 return Builder.CreateCall(Fn, Args, Name);
6076}
6077
// Emit a call to __kmpc_free(thread_id, addr, allocator), releasing memory
// previously obtained via __kmpc_alloc. Returns the resulting CallInst.
// NOTE(review): extraction artifact — the declaration line (orig. 6078,
// presumably OpenMPIRBuilder::createOMPFree) and line 6081 were dropped.
6079 Value *Addr, Value *Allocator,
6080 std::string Name) {
6082 updateToLocation(Loc);
6083
// Source-location ident and thread id mirror the createOMPAlloc call above.
6084 uint32_t SrcLocStrSize;
6085 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6086 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6087 Value *ThreadId = getOrCreateThreadID(Ident);
6088 Value *Args[] = {ThreadId, Addr, Allocator};
6089 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6090 return Builder.CreateCall(Fn, Args, Name);
6091}
6092
// Emit a call to the offload runtime entry point __tgt_interop_init for the
// OpenMP `interop init` clause. Null NumDependences defaults to 0 with a null
// dependence-address pointer.
// NOTE(review): extraction artifact — the declaration line (orig. 6093),
// line 6097, and the default-Device assignment under the null check
// (orig. 6105) were dropped from this listing.
6094 const LocationDescription &Loc, Value *InteropVar,
6095 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6096 Value *DependenceAddress, bool HaveNowaitClause) {
6098 updateToLocation(Loc);
6099
6100 uint32_t SrcLocStrSize;
6101 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6102 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6103 Value *ThreadId = getOrCreateThreadID(Ident);
// NOTE(review): the statement executed when Device is null is missing here
// (stripped line 6105).
6104 if (Device == nullptr)
6106 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6107 if (NumDependences == nullptr) {
6108 NumDependences = ConstantInt::get(Int32, 0);
6109 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6110 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6111 }
6112 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6113 Value *Args[] = {
6114 Ident, ThreadId, InteropVar, InteropTypeVal,
6115 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6116
6117 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6118
6119 return Builder.CreateCall(Fn, Args);
6120}
6121
// Emit a call to __tgt_interop_destroy for the OpenMP `interop destroy`
// clause. Argument defaulting mirrors createOMPInteropInit above.
// NOTE(review): extraction artifact — the declaration line (orig. 6122),
// line 6125, and the default-Device assignment (orig. 6133) were dropped.
6123 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6124 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6126 updateToLocation(Loc);
6127
6128 uint32_t SrcLocStrSize;
6129 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6130 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6131 Value *ThreadId = getOrCreateThreadID(Ident);
// NOTE(review): the statement executed when Device is null is missing here
// (stripped line 6133).
6132 if (Device == nullptr)
6134 if (NumDependences == nullptr) {
6135 NumDependences = ConstantInt::get(Int32, 0);
6136 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6137 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6138 }
6139 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6140 Value *Args[] = {
6141 Ident, ThreadId, InteropVar, Device,
6142 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6143
6144 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6145
6146 return Builder.CreateCall(Fn, Args);
6147}
6148
// Emit a call to __tgt_interop_use for the OpenMP `interop use` clause.
// Argument defaulting mirrors createOMPInteropInit/Destroy above.
// NOTE(review): extraction artifact — the declaration line (orig. 6149),
// line 6154, and the default-Device assignment (orig. 6161) were dropped.
6150 Value *InteropVar, Value *Device,
6151 Value *NumDependences,
6152 Value *DependenceAddress,
6153 bool HaveNowaitClause) {
6155 updateToLocation(Loc);
6156 uint32_t SrcLocStrSize;
6157 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6158 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6159 Value *ThreadId = getOrCreateThreadID(Ident);
// NOTE(review): the statement executed when Device is null is missing here
// (stripped line 6161).
6160 if (Device == nullptr)
6162 if (NumDependences == nullptr) {
6163 NumDependences = ConstantInt::get(Int32, 0);
6164 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6165 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6166 }
6167 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6168 Value *Args[] = {
6169 Ident, ThreadId, InteropVar, Device,
6170 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6171
6172 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6173
6174 return Builder.CreateCall(Fn, Args);
6175}
6176
// Emit a call to __kmpc_threadprivate_cached, creating (or reusing) an
// internal per-module cache global keyed by `Name` for the threadprivate
// variable `Pointer`.
// NOTE(review): extraction artifact — the declaration line (orig. 6177,
// presumably OpenMPIRBuilder::createCachedThreadPrivate) and the remaining
// parameter lines (orig. 6179-6180, which would declare `Size` and `Name`)
// were dropped from this listing.
6178 const LocationDescription &Loc, llvm::Value *Pointer,
6181 updateToLocation(Loc);
6182
6183 uint32_t SrcLocStrSize;
6184 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6185 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6186 Value *ThreadId = getOrCreateThreadID(Ident);
// The cache global is an i8** internal variable shared by all call sites
// that use the same name.
6187 Constant *ThreadPrivateCache =
6188 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6189 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6190
6191 Function *Fn =
6192 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6193
6194 return Builder.CreateCall(Fn, Args);
6195}
6196
// Emit the device-side kernel prologue: materialize the kernel/dynamic
// environment globals, call __kmpc_target_init, and branch to either the
// user-code entry block (when the init call returns -1) or a worker exit
// block. Returns an insertion point in "user_code.entry".
// NOTE(review): extraction artifact — several hyperlinked lines (e.g. the
// declaration line 6197, the Attrs parameter line 6199, 6243, 6249, 6322)
// were dropped from this listing; verify against upstream OMPIRBuilder.cpp.
6198 const LocationDescription &Loc,
6200 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6201 "expected num_threads and num_teams to be specified");
6202
6203 if (!updateToLocation(Loc))
6204 return Loc.IP;
6205
6206 uint32_t SrcLocStrSize;
6207 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6208 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// The generic state machine is only needed outside SPMD execution mode.
6209 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6210 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6211 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6212 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6213 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6214
6215 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6216 Function *Kernel = DebugKernelWrapper;
6217
6218 // We need to strip the debug prefix to get the correct kernel name.
6219 StringRef KernelName = Kernel->getName();
6220 const std::string DebugPrefix = "_debug__";
6221 if (KernelName.ends_with(DebugPrefix)) {
6222 KernelName = KernelName.drop_back(DebugPrefix.length());
6223 Kernel = M.getFunction(KernelName);
6224 assert(Kernel && "Expected the real kernel to exist");
6225 }
6226
6227 // Manifest the launch configuration in the metadata matching the kernel
6228 // environment.
6229 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6230 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6231
6232 // If MaxThreads not set, select the maximum between the default workgroup
6233 // size and the MinThreads value.
6234 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6235 if (MaxThreadsVal < 0)
6236 MaxThreadsVal = std::max(
6237 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6238
6239 if (MaxThreadsVal > 0)
6240 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6241
// Configuration-environment constants; reduction sizes start at 0 and may be
// patched later by createTargetDeinit.
6242 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6244 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6245 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6246 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6247 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6248
6250 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6251 const DataLayout &DL = Fn->getDataLayout();
6252
// Per-kernel mutable dynamic environment (currently only the debug
// indentation level), emitted with protected weak_odr linkage.
6253 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6254 Constant *DynamicEnvironmentInitializer =
6255 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6256 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6257 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6258 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6259 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6260 DL.getDefaultGlobalsAddressSpace());
6261 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6262
// Address-space-cast the global if its type differs from the expected
// pointer type.
6263 Constant *DynamicEnvironment =
6264 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6265 ? DynamicEnvironmentGV
6266 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6267 DynamicEnvironmentPtr)
6268
6269 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6270 ConfigurationEnvironment, {
6271 UseGenericStateMachineVal,
6272 MayUseNestedParallelismVal,
6273 IsSPMDVal,
6274 MinThreads,
6275 MaxThreads,
6276 MinTeams,
6277 MaxTeams,
6278 ReductionDataSize,
6279 ReductionBufferLength,
6280 });
6281 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6282 KernelEnvironment, {
6283 ConfigurationEnvironmentInitializer,
6284 Ident,
6285 DynamicEnvironment,
6286 });
6287 std::string KernelEnvironmentName =
6288 (KernelName + "_kernel_environment").str();
6289 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6290 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6291 KernelEnvironmentInitializer, KernelEnvironmentName,
6292 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6293 DL.getDefaultGlobalsAddressSpace());
6294 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6295
6296 Constant *KernelEnvironment =
6297 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6298 ? KernelEnvironmentGV
6299 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6300 KernelEnvironmentPtr);
6301 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6302 CallInst *ThreadKind =
6303 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6304
// __kmpc_target_init returns -1 for threads that should run the user code.
6305 Value *ExecUserCode = Builder.CreateICmpEQ(
6306 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6307 "exec_user_code");
6308
6309 // ThreadKind = __kmpc_target_init(...)
6310 // if (ThreadKind == -1)
6311 // user_code
6312 // else
6313 // return;
6314
// Split the current block at a placeholder unreachable so we can wire up the
// conditional branch between user code and the worker exit path.
6315 auto *UI = Builder.CreateUnreachable();
6316 BasicBlock *CheckBB = UI->getParent();
6317 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6318
6319 BasicBlock *WorkerExitBB = BasicBlock::Create(
6320 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6321 Builder.SetInsertPoint(WorkerExitBB);
6323
6324 auto *CheckBBTI = CheckBB->getTerminator();
6325 Builder.SetInsertPoint(CheckBBTI);
6326 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6327
6328 CheckBBTI->eraseFromParent();
6329 UI->eraseFromParent();
6330
6331 // Continue in the "user_code" block, see diagram above and in
6332 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6333 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6334}
6335
6337 int32_t TeamsReductionDataSize,
6338 int32_t TeamsReductionBufferLength) {
6339 if (!updateToLocation(Loc))
6340 return;
6341
6343 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6344
6345 Builder.CreateCall(Fn, {});
6346
6347 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6348 return;
6349
6351 // We need to strip the debug prefix to get the correct kernel name.
6352 StringRef KernelName = Kernel->getName();
6353 const std::string DebugPrefix = "_debug__";
6354 if (KernelName.ends_with(DebugPrefix))
6355 KernelName = KernelName.drop_back(DebugPrefix.length());
6356 auto *KernelEnvironmentGV =
6357 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6358 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6359 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6360 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6361 KernelEnvironmentInitializer,
6362 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6363 NewInitializer = ConstantFoldInsertValueInstruction(
6364 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6365 {0, 8});
6366 KernelEnvironmentGV->setInitializer(NewInitializer);
6367}
6368
// Scan the module's "nvvm.annotations" named metadata for the 3-operand node
// (kernel, property-name, value) matching the given kernel and property name.
// Returns the node, or nullptr if no match exists.
// NOTE(review): extraction artifact — the declaration line (orig. 6369,
// presumably a static helper taking Function &Kernel and StringRef Name) was
// dropped from this listing.
6370 Module &M = *Kernel.getParent();
6371 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6372 for (auto *Op : MD->operands()) {
// Annotation nodes of interest have exactly (function, name, value).
6373 if (Op->getNumOperands() != 3)
6374 continue;
6375 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6376 if (!KernelOp || KernelOp->getValue() != &Kernel)
6377 continue;
6378 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6379 if (!Prop || Prop->getString() != Name)
6380 continue;
6381 return Op;
6382 }
6383 return nullptr;
6384}
6385
// Update or insert an nvvm.annotations entry for the kernel: if the named
// annotation already exists, tighten its value (taking the min or max of the
// old and new limits depending on `Min`); otherwise append a new node.
// NOTE(review): extraction artifact — the declaration line (orig. 6386) and
// the two Metadata-array element lines (orig. 6399, 6401) were dropped from
// this listing.
6387 bool Min) {
6388 // Update the "maxntidx" metadata for NVIDIA, or add it.
6389 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6390 if (ExistingOp) {
6391 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6392 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
// Keep the tighter of the existing and requested limits.
6393 ExistingOp->replaceOperandWith(
6394 2, ConstantAsMetadata::get(ConstantInt::get(
6395 OldVal->getValue()->getType(),
6396 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6397 } else {
6398 LLVMContext &Ctx = Kernel.getContext();
6400 MDString::get(Ctx, Name),
6402 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6403 // Append metadata to nvvm.annotations
6404 Module &M = *Kernel.getParent();
6405 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6406 MD->addOperand(MDNode::get(Ctx, MDVals));
6407 }
6408}
6409
// Read the {lower, upper} thread bounds recorded for a kernel: the
// "omp_target_thread_limit" function attribute combined with, per target,
// the AMDGPU "amdgpu-flat-work-group-size" attribute ("LB,UB" string) or the
// NVPTX "maxntidx" nvvm.annotations entry. Missing data yields 0 bounds.
// NOTE(review): extraction artifact — the declaration line (orig. 6411,
// presumably OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &,
// Function &)) was dropped from this listing.
6410std::pair<int32_t, int32_t>
6412 int32_t ThreadLimit =
6413 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6414
6415 if (T.isAMDGPU()) {
6416 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6417 if (!Attr.isValid() || !Attr.isStringAttribute())
6418 return {0, ThreadLimit};
// The attribute encodes "LB,UB"; a malformed UB falls back to ThreadLimit,
// a malformed LB falls back to 0.
6419 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6420 int32_t LB, UB;
6421 if (!llvm::to_integer(UBStr, UB, 10))
6422 return {0, ThreadLimit};
6423 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6424 if (!llvm::to_integer(LBStr, LB, 10))
6425 return {0, UB};
6426 return {LB, UB};
6427 }
6428
6429 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6430 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6431 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6432 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6433 }
6434 return {0, ThreadLimit};
6435}
6436
// Record thread bounds on a kernel: always set "omp_target_thread_limit";
// additionally set "amdgpu-flat-work-group-size" ("LB,UB") on AMDGPU, or
// tighten the NVPTX "maxntidx" annotation (min of old and new) otherwise.
// NOTE(review): extraction artifact — the declaration line (orig. 6437) was
// dropped from this listing.
6438 Function &Kernel, int32_t LB,
6439 int32_t UB) {
6440 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6441
6442 if (T.isAMDGPU()) {
6443 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6444 llvm::utostr(LB) + "," + llvm::utostr(UB));
6445 return;
6446 }
6447
// Non-AMDGPU path: /*Min=*/true keeps the smaller of old and new limits.
6448 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6449}
6450
// Read the {lower, upper} team bounds for a kernel; currently only the
// "omp_target_num_teams" attribute is consulted (lower bound fixed at 0).
// NOTE(review): extraction artifact — the declaration line (orig. 6452,
// presumably OpenMPIRBuilder::readTeamBoundsForKernel) was dropped.
6451std::pair<int32_t, int32_t>
6453 // TODO: Read from backend annotations if available.
6454 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6455}
6456
// Record team bounds on a kernel: "nvvm.maxclusterrank" (NVPTX, only when
// UB > 0), "amdgpu-max-num-workgroups" (AMDGPU, "LB,1,1"), and always the
// generic "omp_target_num_teams" attribute carrying the lower bound.
// NOTE(review): extraction artifact — the declaration line (orig. 6457) was
// dropped from this listing.
6458 int32_t LB, int32_t UB) {
6459 if (T.isNVPTX())
6460 if (UB > 0)
6461 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
6462 if (T.isAMDGPU())
6463 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6464
6465 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6466}
6467
// Apply device-compilation attributes to an outlined target-region function:
// clear DSO-locality and select per-target handling (AMDGCN / NVPTX / SPIRV).
// NOTE(review): extraction artifact — the statements executed for each target
// branch (orig. lines 6471, 6474, 6476, 6478, 6480, e.g. linkage and calling
// convention updates) were dropped from this listing; consult upstream
// OMPIRBuilder.cpp before editing.
6468void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6469 Function *OutlinedFn) {
6470 if (Config.isTargetDevice()) {
6472 // TODO: Determine if DSO local can be set to true.
6473 OutlinedFn->setDSOLocal(false);
6475 if (T.isAMDGCN())
6477 else if (T.isNVPTX())
6479 else if (T.isSPIRV())
6481 }
6482}
6483
6484Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6485 StringRef EntryFnIDName) {
6486 if (Config.isTargetDevice()) {
6487 assert(OutlinedFn && "The outlined function must exist if embedded");
6488 return OutlinedFn;
6489 }
6490
6491 return new GlobalVariable(
6492 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6493 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6494}
6495
6496Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6497 StringRef EntryFnName) {
6498 if (OutlinedFn)
6499 return OutlinedFn;
6500
6501 assert(!M.getGlobalVariable(EntryFnName, true) &&
6502 "Named kernel already exists?");
6503 return new GlobalVariable(
6504 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6505 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6506}
6507
// Generate (via callback) and optionally register an outlined target-region
// function. The callback receives the mangled entry name; when the region is
// an offload entry, the function is registered and OutlinedFnID is filled in.
// NOTE(review): extraction artifact — the declaration line (orig. 6508), the
// condition guarding generation (orig. 6516), and the first branch of the
// EntryFnIDName conditional (orig. 6532) were dropped from this listing.
6509 TargetRegionEntryInfo &EntryInfo,
6510 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6511 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6512
6513 SmallString<64> EntryFnName;
6514 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6515
// Run the generation callback and propagate any error it produces.
6517 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6518 if (!CBResult)
6519 return CBResult.takeError();
6520 OutlinedFn = *CBResult;
6521 } else {
6522 OutlinedFn = nullptr;
6523 }
6524
6525 // If this target outline function is not an offload entry, we don't need to
6526 // register it. This may be in the case of a false if clause, or if there are
6527 // no OpenMP targets.
6528 if (!IsOffloadEntry)
6529 return Error::success();
6530
6531 std::string EntryFnIDName =
6533 ? std::string(EntryFnName)
6534 : createPlatformSpecificName({EntryFnName, "region_id"});
6535
6536 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6537 EntryFnName, EntryFnIDName);
6538 return Error::success();
6539}
6540
// Register an outlined target-region function with the offload-entry
// bookkeeping: apply device attributes (when a function exists), create the
// function ID and entry address, record the entry, and return the ID.
// NOTE(review): extraction artifact — the declaration line (orig. 6541) and
// the trailing arguments of the registration call (orig. 6548, 6550) were
// dropped from this listing.
6542 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6543 StringRef EntryFnName, StringRef EntryFnIDName) {
6544 if (OutlinedFn)
6545 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6546 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6547 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6549 EntryInfo, EntryAddr, OutlinedFnID,
6551 return OutlinedFnID;
6552}
6553
// Generate the enter/exit runtime calls for a `target data` construct (or a
// standalone enter/exit/update when BodyGenCB is null). Handles the `if`
// clause by emitting guarded begin/end regions, optional device-pointer
// privatization (body emitted up to three times: Priv / NoPriv / DupNoPriv),
// and the nowait case via an outer target task.
// NOTE(review): extraction artifact — several hyperlinked lines were dropped
// from this listing (e.g. the declaration line 6554, 6559, 6593, 6616,
// 6619-6621, 6624, 6630, 6632); consult upstream OMPIRBuilder.cpp before
// editing this function.
6555 const LocationDescription &Loc, InsertPointTy AllocaIP,
6556 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6557 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6558 omp::RuntimeFunction *MapperFunc,
6560 BodyGenTy BodyGenType)>
6561 BodyGenCB,
6562 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6563 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6564 if (!updateToLocation(Loc))
6565 return InsertPointTy();
6566
6567 Builder.restoreIP(CodeGenIP);
6568 // Disable TargetData CodeGen on Device pass.
6569 if (Config.IsTargetDevice.value_or(false)) {
// On the device only the body (if any) is emitted; no mapping runtime calls.
6570 if (BodyGenCB) {
6571 InsertPointOrErrorTy AfterIP =
6572 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6573 if (!AfterIP)
6574 return AfterIP.takeError();
6575 Builder.restoreIP(*AfterIP);
6576 }
6577 return Builder.saveIP();
6578 }
6579
// No body callback means this is a standalone enter/exit/update directive.
6580 bool IsStandAlone = !BodyGenCB;
6581 MapInfosTy *MapInfo;
6582 // Generate the code for the opening of the data environment. Capture all the
6583 // arguments of the runtime call by reference because they are used in the
6584 // closing of the region.
6585 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6586 InsertPointTy CodeGenIP) -> Error {
6587 MapInfo = &GenMapInfoCB(Builder.saveIP());
6588 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6589 /*IsNonContiguous=*/true, DeviceAddrCB,
6590 CustomMapperCB);
6591
6592 TargetDataRTArgs RTArgs;
6594
6595 // Emit the number of elements in the offloading arrays.
6596 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6597
6598 // Source location for the ident struct
6599 if (!SrcLocInfo) {
6600 uint32_t SrcLocStrSize;
6601 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6602 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6603 }
6604
6605 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6606 SrcLocInfo, DeviceID,
6607 PointerNum, RTArgs.BasePointersArray,
6608 RTArgs.PointersArray, RTArgs.SizesArray,
6609 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6610 RTArgs.MappersArray};
6611
6612 if (IsStandAlone) {
6613 assert(MapperFunc && "MapperFunc missing for standalone target data");
6614
// Task body that performs the actual mapper call; used directly or wrapped
// in a target task when nowait is requested.
6615 auto TaskBodyCB = [&](Value *, Value *,
6617 if (Info.HasNoWait) {
6618 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6622 }
6623
6625 OffloadingArgs);
6626
6627 if (Info.HasNoWait) {
6628 BasicBlock *OffloadContBlock =
6629 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6631 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6633 }
6634 return Error::success();
6635 };
6636
6637 bool RequiresOuterTargetTask = Info.HasNoWait;
6638 if (!RequiresOuterTargetTask)
6639 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6640 /*TargetTaskAllocaIP=*/{}));
6641 else
6642 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6643 /*Dependencies=*/{}, Info.HasNoWait));
6644 } else {
6645 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6646 omp::OMPRTL___tgt_target_data_begin_mapper);
6647
6648 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6649
// Load the updated device pointers back into their privatized allocas.
6650 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6651 if (isa<AllocaInst>(DeviceMap.second.second)) {
6652 auto *LI =
6653 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6654 Builder.CreateStore(LI, DeviceMap.second.second);
6655 }
6656 }
6657
6658 // If device pointer privatization is required, emit the body of the
6659 // region here. It will have to be duplicated: with and without
6660 // privatization.
6661 InsertPointOrErrorTy AfterIP =
6662 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6663 if (!AfterIP)
6664 return AfterIP.takeError();
6665 Builder.restoreIP(*AfterIP);
6666 }
6667 return Error::success();
6668 };
6669
6670 // If we need device pointer privatization, we need to emit the body of the
6671 // region with no privatization in the 'else' branch of the conditional.
6672 // Otherwise, we don't have to do anything.
6673 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6674 InsertPointTy CodeGenIP) -> Error {
6675 InsertPointOrErrorTy AfterIP =
6676 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6677 if (!AfterIP)
6678 return AfterIP.takeError();
6679 Builder.restoreIP(*AfterIP);
6680 return Error::success();
6681 };
6682
6683 // Generate code for the closing of the data region.
6684 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6685 TargetDataRTArgs RTArgs;
6686 Info.EmitDebug = !MapInfo->Names.empty();
6687 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6688
6689 // Emit the number of elements in the offloading arrays.
6690 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6691
6692 // Source location for the ident struct
6693 if (!SrcLocInfo) {
6694 uint32_t SrcLocStrSize;
6695 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6696 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6697 }
6698
6699 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6700 PointerNum, RTArgs.BasePointersArray,
6701 RTArgs.PointersArray, RTArgs.SizesArray,
6702 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6703 RTArgs.MappersArray};
6704 Function *EndMapperFunc =
6705 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6706
6707 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6708 return Error::success();
6709 };
6710
6711 // We don't have to do anything to close the region if the if clause evaluates
6712 // to false.
6713 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6714 return Error::success();
6715 };
6716
// Drive begin/body/end emission, with or without the `if` clause guard.
6717 Error Err = [&]() -> Error {
6718 if (BodyGenCB) {
6719 Error Err = [&]() {
6720 if (IfCond)
6721 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6722 return BeginThenGen(AllocaIP, Builder.saveIP());
6723 }();
6724
6725 if (Err)
6726 return Err;
6727
6728 // If we don't require privatization of device pointers, we emit the body
6729 // in between the runtime calls. This avoids duplicating the body code.
6730 InsertPointOrErrorTy AfterIP =
6731 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6732 if (!AfterIP)
6733 return AfterIP.takeError();
6734 Builder.restoreIP(*AfterIP);
6735
6736 if (IfCond)
6737 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6738 return EndThenGen(AllocaIP, Builder.saveIP());
6739 }
6740 if (IfCond)
6741 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6742 return BeginThenGen(AllocaIP, Builder.saveIP());
6743 }();
6744
6745 if (Err)
6746 return Err;
6747
6748 return Builder.saveIP();
6749}
6750
// Select the __kmpc_{for,distribute}_static_init_{4,4u,8,8u} runtime function
// matching the induction-variable size (32/64 bit), its signedness, and
// whether this is a GPU distribute construct.
// NOTE(review): extraction artifact — the declaration line(s) (orig.
// 6751-6752), the RuntimeFunction local declaration (orig. 6756), and the
// final return statement (orig. 6769) were dropped from this listing.
6754 assert((IVSize == 32 || IVSize == 64) &&
6755 "IV size is not compatible with the omp runtime");
6757 if (IsGPUDistribute)
6758 Name = IVSize == 32
6759 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6760 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6761 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6762 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6763 else
6764 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6765 : omp::OMPRTL___kmpc_for_static_init_4u)
6766 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6767 : omp::OMPRTL___kmpc_for_static_init_8u);
6768
6770}
6771
// Select __kmpc_dispatch_init_{4,4u,8,8u} based on induction-variable size
// and signedness.
// NOTE(review): extraction artifact — the declaration line (orig. 6772) and
// the final return statement (orig. 6782) were dropped from this listing.
6773 bool IVSigned) {
6774 assert((IVSize == 32 || IVSize == 64) &&
6775 "IV size is not compatible with the omp runtime");
6776 RuntimeFunction Name = IVSize == 32
6777 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6778 : omp::OMPRTL___kmpc_dispatch_init_4u)
6779 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6780 : omp::OMPRTL___kmpc_dispatch_init_8u);
6781
6783}
6784
// Select __kmpc_dispatch_next_{4,4u,8,8u} based on induction-variable size
// and signedness.
// NOTE(review): extraction artifact — the declaration line (orig. 6785) and
// the final return statement (orig. 6795) were dropped from this listing.
6786 bool IVSigned) {
6787 assert((IVSize == 32 || IVSize == 64) &&
6788 "IV size is not compatible with the omp runtime");
6789 RuntimeFunction Name = IVSize == 32
6790 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6791 : omp::OMPRTL___kmpc_dispatch_next_4u)
6792 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6793 : omp::OMPRTL___kmpc_dispatch_next_8u);
6794
6796}
6797
// Select __kmpc_dispatch_fini_{4,4u,8,8u} based on induction-variable size
// and signedness.
// NOTE(review): extraction artifact — the declaration line (orig. 6798) and
// the final return statement (orig. 6808) were dropped from this listing.
6799 bool IVSigned) {
6800 assert((IVSize == 32 || IVSize == 64) &&
6801 "IV size is not compatible with the omp runtime");
6802 RuntimeFunction Name = IVSize == 32
6803 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6804 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6805 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6806 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6807
6809}
6810
// Return the __kmpc_dispatch_deinit runtime function declaration.
// NOTE(review): extraction artifact — the declaration line (orig. 6811) was
// dropped from this listing.
6812 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6813}
6814
// Rewrite debug intrinsics/records inside an outlined target function so
// that their variables and location operands refer to the new subprogram and
// the outlined function's values (per ValueReplacementMap), and add a
// parameter variable for the implicit device "dyn_ptr" argument.
// NOTE(review): extraction artifact — the declaration line (orig. 6815) and
// a few hyperlinked lines inside (orig. 6823-6824, 6834, 6836) were dropped
// from this listing.
6816 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
6817 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
6818
6819 DISubprogram *NewSP = Func->getSubprogram();
// Without a subprogram there is no debug info to fix up.
6820 if (!NewSP)
6821 return;
6822
6825
// Map an old variable (plus its new argument number) to a variable in the
// outlined subprogram, caching results per old variable.
6826 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
6827 auto NewSP = Func->getSubprogram();
6828 DILocalVariable *&NewVar = RemappedVariables[OldVar];
6829 // Only use cached variable if the arg number matches. This is important
6830 // so that DIVariable created for privatized variables are not discarded.
6831 if (NewVar && (arg == NewVar->getArg()))
6832 return NewVar;
6833
6835 *OldVar->getScope(), *NewSP, Builder.getContext(), Cache);
6837 Builder.getContext(), NewScope, OldVar->getName(), OldVar->getFile(),
6838 OldVar->getLine(), OldVar->getType(), arg, OldVar->getFlags(),
6839 OldVar->getAlignInBits(), OldVar->getAnnotations());
6840 return NewVar;
6841 };
6842
// Replace remapped location operands and retarget the record's variable.
6843 auto UpdateDebugRecord = [&](auto *DR) {
6844 DILocalVariable *OldVar = DR->getVariable();
6845 unsigned ArgNo = 0;
6846 for (auto Loc : DR->location_ops()) {
6847 auto Iter = ValueReplacementMap.find(Loc);
6848 if (Iter != ValueReplacementMap.end()) {
6849 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
6850 ArgNo = std::get<1>(Iter->second) + 1;
6851 }
6852 }
6853 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
6854 };
6855
6856 // The location and scope of variable intrinsics and records still point to
6857 // the parent function of the target region. Update them.
6858 for (Instruction &I : instructions(Func)) {
6859 if (auto *DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&I))
6860 UpdateDebugRecord(DDI);
6861
6862 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
6863 UpdateDebugRecord(&DVR);
6864 }
6865 // An extra argument is passed to the device. Create the debug data for it.
6866 if (OMPBuilder.Config.isTargetDevice()) {
6867 DICompileUnit *CU = NewSP->getUnit();
6868 Module *M = Func->getParent();
6869 DIBuilder DB(*M, true, CU);
6870 DIType *VoidPtrTy =
6871 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
6872 DILocalVariable *Var = DB.createParameterVariable(
6873 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
6874 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
6875 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
6876 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
6877 &(*Func->begin()));
6878 }
6879}
6880
// createOutlinedFunction (signature tail below): builds the function that
// holds an outlined `omp target` region body and rewrites the region's input
// values to the new function's parameters.
// NOTE(review): this listing is a doxygen extraction — each line is prefixed
// with the upstream line number, and several upstream lines were dropped by
// the extraction (e.g. 6881, 6883, 6885-6886, 6938, 6956, 6977, 6982, 7006,
// 7044). Code is kept byte-identical to the extracted text; confirm dropped
// lines against upstream before compiling.
6882 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6884 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
6887 SmallVector<Type *> ParameterTypes;
6888 if (OMPBuilder.Config.isTargetDevice()) {
6889 // Add the "implicit" runtime argument we use to provide launch specific
6890 // information for target devices.
6891 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6892 ParameterTypes.push_back(Int8PtrTy);
6893
6894 // All parameters to target devices are passed as pointers
6895 // or i64. This assumes 64-bit address spaces/pointers.
6896 for (auto &Arg : Inputs)
6897 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6898 ? Arg->getType()
6899 : Type::getInt64Ty(Builder.getContext()));
6900 } else {
6901 for (auto &Arg : Inputs)
6902 ParameterTypes.push_back(Arg->getType());
6903 }
6904
6905 auto BB = Builder.GetInsertBlock();
6906 auto M = BB->getModule();
6907 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6908 /*isVarArg*/ false);
6909 auto Func =
6910 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6911
6912 // Forward target-cpu and target-features function attributes from the
6913 // original function to the new outlined function.
6914 Function *ParentFn = Builder.GetInsertBlock()->getParent();
6915
6916 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
6917 if (TargetCpuAttr.isStringAttribute())
6918 Func->addFnAttr(TargetCpuAttr);
6919
6920 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
6921 if (TargetFeaturesAttr.isStringAttribute())
6922 Func->addFnAttr(TargetFeaturesAttr);
6923
// On the device side, record the kernel's execution mode and keep the marker
// alive via llvm.compiler.used.
6924 if (OMPBuilder.Config.isTargetDevice()) {
6925 Value *ExecMode =
6926 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
6927 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
6928 }
6929
6930 // Save insert point.
6931 IRBuilder<>::InsertPointGuard IPG(Builder);
6932 // If there's a DISubprogram associated with current function, then
6933 // generate one for the outlined function.
6934 if (Function *ParentFunc = BB->getParent()) {
6935 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6936 DICompileUnit *CU = SP->getUnit();
6937 DIBuilder DB(*M, true, CU);
// NOTE(review): upstream line 6938 — presumably the declaration of `DL`
// (the builder's current debug location) — was dropped by the extraction.
6939 if (DL) {
6940 // TODO: We are using nullopt for arguments at the moment. This will
6941 // need to be updated when debug data is being generated for variables.
6942 DISubroutineType *Ty =
6943 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6944 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6945 DISubprogram::SPFlagOptimized |
6946 DISubprogram::SPFlagLocalToUnit;
6947
6948 DISubprogram *OutlinedSP = DB.createFunction(
6949 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6950 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6951
6952 // Attach subprogram to the function.
6953 Func->setSubprogram(OutlinedSP);
6954 // Update the CurrentDebugLocation in the builder so that right scope
6955 // is used for things inside outlined function.
// NOTE(review): upstream line 6956 (the call receiving this DILocation,
// likely Builder.SetCurrentDebugLocation) was dropped by the extraction.
6957 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6958 OutlinedSP, DL.getInlinedAt()));
6959 }
6960 }
6961 }
6962
6963 // Generate the region into the function.
6964 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6965 Builder.SetInsertPoint(EntryBB);
6966
6967 // Insert target init call in the device compilation pass.
6968 if (OMPBuilder.Config.isTargetDevice())
6969 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
6970
6971 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6972
6973 // As we embed the user code in the middle of our target region after we
6974 // generate entry code, we must move what allocas we can into the entry
6975 // block to avoid possible breaking optimisations for device
6976 if (OMPBuilder.Config.isTargetDevice())
// NOTE(review): upstream line 6977 (the statement guarded by the `if` above)
// was dropped by the extraction.
6978
6979 // Insert target deinit call in the device compilation pass.
6980 BasicBlock *OutlinedBodyBB =
6981 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
// NOTE(review): upstream line 6982 (the start of the expression yielding
// `AfterIP`, presumably a CBFunc invocation) was dropped by the extraction.
6983 Builder.saveIP(),
6984 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6985 if (!AfterIP)
6986 return AfterIP.takeError();
6987 Builder.restoreIP(*AfterIP);
6988 if (OMPBuilder.Config.isTargetDevice())
6989 OMPBuilder.createTargetDeinit(Builder);
6990
6991 // Insert return instruction.
6992 Builder.CreateRetVoid();
6993
6994 // New Alloca IP at entry point of created device function.
6995 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
6996 auto AllocaIP = Builder.saveIP();
6997
6998 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6999
7000 // Skip the artificial dyn_ptr on the device.
7001 const auto &ArgRange =
7002 OMPBuilder.Config.isTargetDevice()
7003 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7004 : Func->args();
7005
// NOTE(review): upstream line 7006 (presumably the declaration of
// `ValueReplacementMap` used below) was dropped by the extraction.
7007
7008 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7009 // Things like GEP's can come in the form of Constants. Constants and
7010 // ConstantExpr's do not have access to the knowledge of what they're
7011 // contained in, so we must dig a little to find an instruction so we
7012 // can tell if they're used inside of the function we're outlining. We
7013 // also replace the original constant expression with a new instruction
7014 // equivalent; an instruction as it allows easy modification in the
7015 // following loop, as we can now know the constant (instruction) is
7016 // owned by our target function and replaceUsesOfWith can now be invoked
7017 // on it (cannot do this with constants it seems). A brand new one also
7018 // allows us to be cautious as it is perhaps possible the old expression
7019 // was used inside of the function but exists and is used externally
7020 // (unlikely by the nature of a Constant, but still).
7021 // NOTE: We cannot remove dead constants that have been rewritten to
7022 // instructions at this stage, we run the risk of breaking later lowering
7023 // by doing so as we could still be in the process of lowering the module
7024 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7025 // constants we have created rewritten versions of.
7026 if (auto *Const = dyn_cast<Constant>(Input))
7027 convertUsersOfConstantsToInstructions(Const, Func, false);
7028
7029 // Collect all the instructions
7030 for (User *User : make_early_inc_range(Input->users()))
7031 if (auto *Instr = dyn_cast<Instruction>(User))
7032 if (Instr->getFunction() == Func)
7033 Instr->replaceUsesOfWith(Input, InputCopy);
7034 };
7035
7036 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7037
7038 // Rewrite uses of input valus to parameters.
7039 for (auto InArg : zip(Inputs, ArgRange)) {
7040 Value *Input = std::get<0>(InArg);
7041 Argument &Arg = std::get<1>(InArg);
7042 Value *InputCopy = nullptr;
7043
// NOTE(review): upstream line 7044 (the start of the expression yielding
// `AfterIP` from ArgAccessorFuncCB) was dropped by the extraction.
7045 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7046 if (!AfterIP)
7047 return AfterIP.takeError();
7048 Builder.restoreIP(*AfterIP);
7049 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7050
7051 // In certain cases a Global may be set up for replacement, however, this
7052 // Global may be used in multiple arguments to the kernel, just segmented
7053 // apart, for example, if we have a global array, that is sectioned into
7054 // multiple mappings (technically not legal in OpenMP, but there is a case
7055 // in Fortran for Common Blocks where this is neccesary), we will end up
7056 // with GEP's into this array inside the kernel, that refer to the Global
7057 // but are technically seperate arguments to the kernel for all intents and
7058 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7059 // index, it will fold into an referal to the Global, if we then encounter
7060 // this folded GEP during replacement all of the references to the
7061 // Global in the kernel will be replaced with the argument we have generated
7062 // that corresponds to it, including any other GEP's that refer to the
7063 // Global that may be other arguments. This will invalidate all of the other
7064 // preceding mapped arguments that refer to the same global that may be
7065 // seperate segments. To prevent this, we defer global processing until all
7066 // other processing has been performed.
// NOTE(review): GlobalObject and GlobalVariable both derive from
// GlobalValue in the LLVM class hierarchy, so the first isa<> below already
// subsumes the other two — TODO confirm and simplify upstream.
7067 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
7068 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
7069 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
7070 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7071 continue;
7072 }
7073
7074 ReplaceValue(Input, InputCopy, Func);
7075 }
7076
7077 // Replace all of our deferred Input values, currently just Globals.
7078 for (auto Deferred : DeferredReplacement)
7079 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7080
7081 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7082 ValueReplacementMap);
7083 return Func;
7084}
7085
7086/// Create an entry point for a target task with the following.
7087/// It'll have the following signature
7088/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7089/// This function is called from emitTargetTask once the
7090/// code to launch the target kernel has been outlined already.
// NOTE(review): upstream line 7091 (the start of this function's signature,
// presumably `static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &,`)
// was dropped by the doxygen extraction; code below is kept byte-identical.
7092 IRBuilderBase &Builder,
7093 CallInst *StaleCI) {
7094 Module &M = OMPBuilder.M;
7095 // KernelLaunchFunction is the target launch function, i.e.
7096 // the function that sets up kernel arguments and calls
7097 // __tgt_target_kernel to launch the kernel on the device.
7098 //
7099 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7100
7101 // StaleCI is the CallInst which is the call to the outlined
7102 // target kernel launch function. If there are values that the
7103 // outlined function uses then these are aggregated into a structure
7104 // which is passed as the second argument. If not, then there's
7105 // only one argument, the threadID. So, StaleCI can be
7106 //
7107 // %structArg = alloca { ptr, ptr }, align 8
7108 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7109 // store ptr %20, ptr %gep_, align 8
7110 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7111 // store ptr %21, ptr %gep_8, align 8
7112 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7113 //
7114 // OR
7115 //
7116 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
// NOTE(review): upstream line 7117 was dropped here; it pairs with the
// `StaleCI->getIterator())` continuation below — likely an insert-point
// guard/SetInsertPoint call. Confirm against upstream.
7118 StaleCI->getIterator());
7119 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7120 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7121 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7122 Type *TaskTy = OMPBuilder.Task;
7123 auto ProxyFnTy =
7124 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7125 /* isVarArg */ false);
7126 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7127 ".omp_target_task_proxy_func",
7128 Builder.GetInsertBlock()->getModule());
7129 ProxyFn->getArg(0)->setName("thread.id");
7130 ProxyFn->getArg(1)->setName("task");
7131
7132 BasicBlock *EntryBB =
7133 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7134 Builder.SetInsertPoint(EntryBB);
7135
7136 bool HasShareds = StaleCI->arg_size() > 1;
7137 // TODO: This is a temporary assert to prove to ourselves that
7138 // the outlined target launch function is always going to have
7139 // atmost two arguments if there is any data shared between
7140 // host and device.
7141 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
7142 "StaleCI with shareds should have exactly two arguments.");
7143 if (HasShareds) {
7144 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7145 assert(ArgStructAlloca &&
7146 "Unable to find the alloca instruction corresponding to arguments "
7147 "for extracted function");
7148 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7149
// Copy the task's shared data into a fresh local struct so the launch
// function sees the same aggregated-argument layout the outliner produced.
7150 AllocaInst *NewArgStructAlloca =
7151 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7152 Value *TaskT = ProxyFn->getArg(1);
7153 Value *ThreadId = ProxyFn->getArg(0);
7154 Value *SharedsSize =
7155 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7156
7157 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7158 LoadInst *LoadShared =
7159 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7160
7161 Builder.CreateMemCpy(
7162 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7163 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7164
7165 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7166 }
7167 Builder.CreateRetVoid();
7168 return ProxyFn;
7169}
7170
// Outlines the target region into OutlinedFn (via createOutlinedFunction)
// and registers it as an offload entry, producing OutlinedFnID.
// NOTE(review): upstream line 7171 (return type and function name —
// presumably `static Error emitTargetOutlinedFunction(`) and lines
// 7174/7176-7178 (remaining parameters) were dropped by the doxygen
// extraction; confirm against upstream.
7172 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7173 TargetRegionEntryInfo &EntryInfo,
7175 Function *&OutlinedFn, Constant *&OutlinedFnID,
7179
7180 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7181 [&](StringRef EntryFnName) {
7182 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7183 EntryFnName, Inputs, CBFunc,
7184 ArgAccessorFuncCB);
7185 };
7186
7187 return OMPBuilder.emitTargetRegionFunction(
7188 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7189 OutlinedFnID);
7190}
7191
// OpenMPIRBuilder::emitTargetTask: wraps the target-kernel launch (or host
// fallback) in an OpenMP task so `depend` and `nowait` clauses can be
// honoured. The big comment below documents the full code-gen scheme.
// NOTE(review): this listing is a doxygen extraction — each line carries the
// upstream line number, and several upstream lines were dropped (e.g. 7192
// with the return type/name, 7194-7195 with remaining parameters, 7304-7305,
// 7332, 7344, 7356-7357, 7381, 7423, 7430, 7447, 7451, 7467). Code is kept
// byte-identical to the extracted text.
7193 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7196 bool HasNoWait) {
7197
7198 // The following explains the code-gen scenario for the `target` directive. A
7199 // similar scneario is followed for other device-related directives (e.g.
7200 // `target enter data`) but in similar fashion since we only need to emit task
7201 // that encapsulates the proper runtime call.
7202 //
7203 // When we arrive at this function, the target region itself has been
7204 // outlined into the function OutlinedFn.
7205 // So at ths point, for
7206 // --------------------------------------------------
7207 // void user_code_that_offloads(...) {
7208 // omp target depend(..) map(from:a) map(to:b, c)
7209 // a = b + c
7210 // }
7211 //
7212 // --------------------------------------------------
7213 //
7214 // we have
7215 //
7216 // --------------------------------------------------
7217 //
7218 // void user_code_that_offloads(...) {
7219 // %.offload_baseptrs = alloca [3 x ptr], align 8
7220 // %.offload_ptrs = alloca [3 x ptr], align 8
7221 // %.offload_mappers = alloca [3 x ptr], align 8
7222 // ;; target region has been outlined and now we need to
7223 // ;; offload to it via a target task.
7224 // }
7225 // void outlined_device_function(ptr a, ptr b, ptr c) {
7226 // *a = *b + *c
7227 // }
7228 //
7229 // We have to now do the following
7230 // (i) Make an offloading call to outlined_device_function using the OpenMP
7231 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7232 // emitted by emitKernelLaunch
7233 // (ii) Create a task entry point function that calls kernel_launch_function
7234 // and is the entry point for the target task. See
7235 // '@.omp_target_task_proxy_func in the pseudocode below.
7236 // (iii) Create a task with the task entry point created in (ii)
7237 //
7238 // That is we create the following
7239 //
7240 // void user_code_that_offloads(...) {
7241 // %.offload_baseptrs = alloca [3 x ptr], align 8
7242 // %.offload_ptrs = alloca [3 x ptr], align 8
7243 // %.offload_mappers = alloca [3 x ptr], align 8
7244 //
7245 // %structArg = alloca { ptr, ptr, ptr }, align 8
7246 // %strucArg[0] = %.offload_baseptrs
7247 // %strucArg[1] = %.offload_ptrs
7248 // %strucArg[2] = %.offload_mappers
7249 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7250 // @.omp_target_task_proxy_func)
7251 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7252 // dependencies_array = ...
7253 // ;; if nowait not present
7254 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7255 // call @__kmpc_omp_task_begin_if0(...)
7256 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7257 // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
7258 // }
7259 //
7260 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7261 // ptr %task) {
7262 // %structArg = alloca {ptr, ptr, ptr}
7263 // %shared_data = load (getelementptr %task, 0, 0)
7264 // mempcy(%structArg, %shared_data, sizeof(structArg))
7265 // kernel_launch_function(%thread.id, %structArg)
7266 // }
7267 //
7268 // We need the proxy function because the signature of the task entry point
7269 // expected by kmpc_omp_task is always the same and will be different from
7270 // that of the kernel_launch function.
7271 //
7272 // kernel_launch_function is generated by emitKernelLaunch and has the
7273 // always_inline attribute.
7274 // void kernel_launch_function(thread_id,
7275 // structArg) alwaysinline {
7276 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7277 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7278 // offload_ptrs = load(getelementptr structArg, 0, 1)
7279 // offload_mappers = load(getelementptr structArg, 0, 2)
7280 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7281 // ; offload_mappers
7282 // call i32 @__tgt_target_kernel(...,
7283 // outlined_device_function,
7284 // ptr %kernel_args)
7285 // }
7286 // void outlined_device_function(ptr a, ptr b, ptr c) {
7287 // *a = *b + *c
7288 // }
7289 //
7290 BasicBlock *TargetTaskBodyBB =
7291 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7292 BasicBlock *TargetTaskAllocaBB =
7293 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7294
7295 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7296 TargetTaskAllocaBB->begin());
7297 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7298
7299 OutlineInfo OI;
7300 OI.EntryBB = TargetTaskAllocaBB;
7301 OI.OuterAllocaBB = AllocaIP.getBlock();
7302
7303 // Add the thread ID argument.
// NOTE(review): upstream lines 7304-7305 (the declaration of `ToBeDeleted`
// and the start of the call this continuation line belongs to) were dropped
// by the extraction.
7306 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7307
7308 Builder.restoreIP(TargetTaskBodyIP);
7309
7310 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7311 return Err;
7312
// Everything from EntryBB to ExitBB gets outlined later; PostOutlineCB then
// replaces the stale call with the real task-spawning runtime calls.
7313 OI.ExitBB = Builder.saveIP().getBlock();
7314 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7315 DeviceID](Function &OutlinedFn) mutable {
7316 assert(OutlinedFn.getNumUses() == 1 &&
7317 "there must be a single user for the outlined function");
7318
7319 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7320 bool HasShareds = StaleCI->arg_size() > 1;
7321
7322 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7323
7324 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7325 << "\n");
7326
7327 Builder.SetInsertPoint(StaleCI);
7328
7329 // Gather the arguments for emitting the runtime call.
7330 uint32_t SrcLocStrSize;
7331 Constant *SrcLocStr =
// NOTE(review): upstream line 7332 (the initializer of SrcLocStr) was
// dropped by the extraction.
7333 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7334
7335 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7336 //
7337 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7338 // the DeviceID to the deferred task and also since
7339 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7340 bool NeedsTargetTask = HasNoWait && DeviceID;
7341 Function *TaskAllocFn =
7342 !NeedsTargetTask
7343 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7345 OMPRTL___kmpc_omp_target_task_alloc);
7346
7347 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7348 // call.
7349 Value *ThreadID = getOrCreateThreadID(Ident);
7350
7351 // Argument - `sizeof_kmp_task_t` (TaskSize)
7352 // Tasksize refers to the size in bytes of kmp_task_t data structure
7353 // including private vars accessed in task.
7354 // TODO: add kmp_task_t_with_privates (privates)
7355 Value *TaskSize =
// NOTE(review): upstream lines 7356-7357 (the initializer of TaskSize) were
// dropped by the extraction.
7358 // Argument - `sizeof_shareds` (SharedsSize)
7359 // SharedsSize refers to the shareds array size in the kmp_task_t data
7360 // structure.
7361 Value *SharedsSize = Builder.getInt64(0);
7362 if (HasShareds) {
7363 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7364 assert(ArgStructAlloca &&
7365 "Unable to find the alloca instruction corresponding to arguments "
7366 "for extracted function");
7367 auto *ArgStructType =
7368 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7369 assert(ArgStructType && "Unable to find struct type corresponding to "
7370 "arguments for extracted function");
7371 SharedsSize =
7373 }
7374
7375 // Argument - `flags`
7376 // Task is tied iff (Flags & 1) == 1.
7377 // Task is untied iff (Flags & 1) == 0.
7378 // Task is final iff (Flags & 2) == 2.
7379 // Task is not final iff (Flags & 2) == 0.
7380 // A target task is not final and is untied.
// NOTE(review): upstream line 7381 (the declaration of `Flags` used below)
// was dropped by the extraction.
7382
7383 // Emit the @__kmpc_omp_task_alloc runtime call
7384 // The runtime call returns a pointer to an area where the task captured
7385 // variables must be copied before the task is run (TaskData)
7386 CallInst *TaskData = nullptr;
7387
7388 SmallVector<llvm::Value *> TaskAllocArgs = {
7389 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7390 /*flags=*/Flags,
7391 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7392 /*task_func=*/ProxyFn};
7393
7394 if (NeedsTargetTask) {
7395 assert(DeviceID && "Expected non-empty device ID.");
7396 TaskAllocArgs.push_back(DeviceID);
7397 }
7398
7399 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7400
7401 if (HasShareds) {
7402 Value *Shareds = StaleCI->getArgOperand(1);
7403 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7404 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7405 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7406 SharedsSize);
7407 }
7408
7409 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7410
7411 // ---------------------------------------------------------------
7412 // V5.2 13.8 target construct
7413 // If the nowait clause is present, execution of the target task
7414 // may be deferred. If the nowait clause is not present, the target task is
7415 // an included task.
7416 // ---------------------------------------------------------------
7417 // The above means that the lack of a nowait on the target construct
7418 // translates to '#pragma omp task if(0)'
7419 if (!NeedsTargetTask) {
7420 if (DepArray) {
7421 Function *TaskWaitFn =
7422 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
// NOTE(review): upstream line 7423 (the Builder.CreateCall( opener for this
// argument list) was dropped by the extraction, as was line 7430 (the
// final argument and closing of the call).
7424 TaskWaitFn,
7425 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7426 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7427 /*dep_list=*/DepArray,
7428 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7429 /*noalias_dep_list=*/
7431 }
7432 // Included task.
7433 Function *TaskBeginFn =
7434 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7435 Function *TaskCompleteFn =
7436 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7437 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7438 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7439 CI->setDebugLoc(StaleCI->getDebugLoc());
7440 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7441 } else if (DepArray) {
7442 // HasNoWait - meaning the task may be deferred. Call
7443 // __kmpc_omp_task_with_deps if there are dependencies,
7444 // else call __kmpc_omp_task
7445 Function *TaskFn =
7446 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
// NOTE(review): upstream lines 7447 and 7451 (the Builder.CreateCall(
// opener and the final argument/closer) were dropped by the extraction.
7448 TaskFn,
7449 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7450 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7452 } else {
7453 // Emit the @__kmpc_omp_task runtime call to spawn the task
7454 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7455 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7456 }
7457
7458 StaleCI->eraseFromParent();
7459 for (Instruction *I : llvm::reverse(ToBeDeleted))
7460 I->eraseFromParent();
7461 };
7462 addOutlineInfo(std::move(OI));
7463
7464 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7465 << *(Builder.GetInsertBlock()) << "\n");
7466 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7468 << "\n");
7469 return Builder.saveIP();
7470}
7471
// Materialises the offloading arrays for CombinedInfo and then packages them
// into RTArgs for the runtime call.
// NOTE(review): upstream line 7472 (the return type and function name —
// presumably `void OpenMPIRBuilder::emitOffloadingArraysAndArgs(`) was
// dropped by the doxygen extraction.
7473 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7474 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7475 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7476 function_ref<Value *(unsigned int)> CustomMapperCB) {
7477 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7478 DeviceAddrCB, CustomMapperCB);
7479 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7480}
7481
// emitTargetCall: host-side code-gen for a target construct — builds the
// kernel-launch ("then") path, the host-fallback ("else") path, and wires in
// an `if` clause and/or an outer target task when `nowait`/`depend` require
// one.
// NOTE(review): this listing is a doxygen extraction; upstream lines
// 7483-7486 and 7488-7490 (the remaining parameters of this static
// function), 7496, 7505, 7512, 7540, 7543, 7561-7562, 7567 and 7637 were
// dropped. Code is kept byte-identical to the extracted text.
7482static void
7487 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
7491 bool HasNoWait = false) {
7492 // Generate a function call to the host fallback implementation of the target
7493 // region. This is called by the host when no offload entry was generated for
7494 // the target region and when the offloading call fails at runtime.
7495 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7497 Builder.restoreIP(IP);
7498 Builder.CreateCall(OutlinedFn, Args);
7499 return Builder.saveIP();
7500 };
7501
7502 bool HasDependencies = Dependencies.size() > 0;
7503 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7504
// NOTE(review): upstream line 7505 (presumably the declaration of `KArgs`
// captured by the lambdas below) was dropped by the extraction.
7506
7507 auto TaskBodyCB =
7508 [&](Value *DeviceID, Value *RTLoc,
7509 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7510 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7511 // produce any.
7513 // emitKernelLaunch makes the necessary runtime call to offload the
7514 // kernel. We then outline all that code into a separate function
7515 // ('kernel_launch_function' in the pseudo code above). This function is
7516 // then called by the target task proxy function (see
7517 // '@.omp_target_task_proxy_func' in the pseudo code above)
7518 // "@.omp_target_task_proxy_func' is generated by
7519 // emitTargetTaskProxyFunction.
7520 if (OutlinedFnID && DeviceID)
7521 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7522 EmitTargetCallFallbackCB, KArgs,
7523 DeviceID, RTLoc, TargetTaskAllocaIP);
7524
7525 // We only need to do the outlining if `DeviceID` is set to avoid calling
7526 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
7527 // generating the `else` branch of an `if` clause.
7528 //
7529 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7530 // In this case, we execute the host implementation directly.
7531 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7532 }());
7533
7534 OMPBuilder.Builder.restoreIP(AfterIP);
7535 return Error::success();
7536 };
7537
7538 auto &&EmitTargetCallElse =
7539 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7541 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7542 // produce any.
7544 if (RequiresOuterTargetTask) {
7545 // Arguments that are intended to be directly forwarded to an
7546 // emitKernelLaunch call are pased as nullptr, since
7547 // OutlinedFnID=nullptr results in that call not being done.
7548 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7549 /*RTLoc=*/nullptr, AllocaIP,
7550 Dependencies, HasNoWait);
7551 }
7552 return EmitTargetCallFallbackCB(Builder.saveIP());
7553 }());
7554
7555 Builder.restoreIP(AfterIP);
7556 return Error::success();
7557 };
7558
7559 auto &&EmitTargetCallThen =
7560 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7563 /*RequiresDevicePointerInfo=*/false,
7564 /*SeparateBeginEndCalls=*/true);
7565
7566 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7568 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7569 RTArgs, MapInfo,
7570 /*IsNonContiguous=*/true,
7571 /*ForEndCall=*/false);
7572
// Per-dimension team counts: a runtime clause value wins over the default.
7573 SmallVector<Value *, 3> NumTeamsC;
7574 for (auto [DefaultVal, RuntimeVal] :
7575 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
7576 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
7577 : Builder.getInt32(DefaultVal));
7578
7579 // Calculate number of threads: 0 if no clauses specified, otherwise it is
7580 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
7581 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
7582 if (Clause)
7583 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
7584 /*isSigned=*/false);
7585 return Clause;
7586 };
7587 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
7588 if (Clause)
7589 Result =
7590 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
7591 Result, Clause)
7592 : Clause;
7593 };
7594
7595 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
7596 // the NUM_THREADS clause is overriden by THREAD_LIMIT.
7597 SmallVector<Value *, 3> NumThreadsC;
7598 Value *MaxThreadsClause =
7599 RuntimeAttrs.TeamsThreadLimit.size() == 1
7600 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
7601 : nullptr;
7602
7603 for (auto [TeamsVal, TargetVal] : zip_equal(
7604 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
7605 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
7606 Value *NumThreads = InitMaxThreadsClause(TargetVal);
7607
7608 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
7609 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
7610
7611 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
7612 }
7613
7614 unsigned NumTargetItems = Info.NumberOfPtrs;
7615 // TODO: Use correct device ID
7616 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7617 uint32_t SrcLocStrSize;
7618 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7619 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7620 llvm::omp::IdentFlag(0), 0);
7621
7622 Value *TripCount = RuntimeAttrs.LoopTripCount
7623 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
7624 Builder.getInt64Ty(),
7625 /*isSigned=*/false)
7626 : Builder.getInt64(0);
7627
7628 // TODO: Use correct DynCGGroupMem
7629 Value *DynCGGroupMem = Builder.getInt32(0);
7630
7631 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
7632 NumTeamsC, NumThreadsC,
7633 DynCGGroupMem, HasNoWait);
7634
7635 // Assume no error was returned because TaskBodyCB and
7636 // EmitTargetCallFallbackCB don't produce any.
7638 // The presence of certain clauses on the target directive require the
7639 // explicit generation of the target task.
7640 if (RequiresOuterTargetTask)
7641 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7642 Dependencies, HasNoWait);
7643
7644 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7645 EmitTargetCallFallbackCB, KArgs,
7646 DeviceID, RTLoc, AllocaIP);
7647 }());
7648
7649 Builder.restoreIP(AfterIP);
7650 return Error::success();
7651 };
7652
7653 // If we don't have an ID for the target region, it means an offload entry
7654 // wasn't created. In this case we just run the host fallback directly and
7655 // ignore any potential 'if' clauses.
7656 if (!OutlinedFnID) {
7657 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
7658 return;
7659 }
7660
7661 // If there's no 'if' clause, only generate the kernel launch code path.
7662 if (!IfCond) {
7663 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
7664 return;
7665 }
7666
7667 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
7668 EmitTargetCallElse, AllocaIP));
7669}
7670
// Public entry point for target-construct code-gen (presumably
// `OpenMPIRBuilder::createTarget` — upstream line 7671 carrying the return
// type and name was dropped by the extraction, as were lines 7676-7678 with
// the remaining parameters and 7691 with the `if (Error Err = ...` opener
// of the outlining call). Outlines the region, then on the host emits the
// offloading call via emitTargetCall.
7672 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7673 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7674 const TargetKernelDefaultAttrs &DefaultAttrs,
7675 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
7679 SmallVector<DependData> Dependencies, bool HasNowait) {
7680
7681 if (!updateToLocation(Loc))
7682 return InsertPointTy();
7683
7684 Builder.restoreIP(CodeGenIP);
7685
7686 Function *OutlinedFn;
7687 Constant *OutlinedFnID = nullptr;
7688 // The target region is outlined into its own function. The LLVM IR for
7689 // the target region itself is generated using the callbacks CBFunc
7690 // and ArgAccessorFuncCB
7692 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
7693 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB))
7694 return Err;
7695
7696 // If we are not on the target device, then we need to generate code
7697 // to make a remote call (offload) to the previously outlined function
7698 // that represents the target region. Do that now.
7699 if (!Config.isTargetDevice())
7700 emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond,
7701 OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies,
7702 HasNowait);
7703 return Builder.saveIP();
7704}
7705
7706std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7707 StringRef FirstSeparator,
7708 StringRef Separator) {
7709 SmallString<128> Buffer;
7711 StringRef Sep = FirstSeparator;
7712 for (StringRef Part : Parts) {
7713 OS << Sep << Part;
7714 Sep = Separator;
7715 }
7716 return OS.str().str();
7717}
7718
// Builds an OpenMP-internal name using the configured platform-specific
// separators (delegates to getNameWithSeparators).
// NOTE(review): upstream line 7720 (the remainder of this signature —
// presumably `OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef>
// Parts) const {`) was dropped by the doxygen extraction.
7719std::string
7721 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7722 Config.separator());
7723}
7724
// Returns the internal global variable registered under Name, creating and
// caching it in InternalVars on first use.
// NOTE(review): upstream lines 7725-7726 (the signature start — presumably
// `GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty,
// const Twine &Name,`), 7738-7739 (the two linkage alternatives of the
// conditional) and 7743 (the trailing GlobalVariable constructor arguments)
// were dropped by the doxygen extraction.
7727 unsigned AddressSpace) {
// try_emplace gives us either the cached global or a fresh null slot to fill.
7728 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7729 if (Elem.second) {
7730 assert(Elem.second->getValueType() == Ty &&
7731 "OMP internal variable has different type than requested");
7732 } else {
7733 // TODO: investigate the appropriate linkage type used for the global
7734 // variable for possibly changing that to internal or private, or maybe
7735 // create different versions of the function for different OMP internal
7736 // variables.
7737 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7740 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7741 Constant::getNullValue(Ty), Elem.first(),
7742 /*InsertBefore=*/nullptr,
// Align to the max of the type's ABI alignment and the pointer alignment of
// the requested address space.
7744 const DataLayout &DL = M.getDataLayout();
7745 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7746 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7747 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7748 Elem.second = GV;
7749 }
7750
7751 return Elem.second;
7752}
7753
7754Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7755 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7756 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7757 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7758}
7759
7762 Value *Null =
7763 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7764 Value *SizeGep =
7765 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7766 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7767 return SizePtrToInt;
7768}
7769
7772 std::string VarName) {
7773 llvm::Constant *MaptypesArrayInit =
7775 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7776 M, MaptypesArrayInit->getType(),
7777 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7778 VarName);
7779 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7780 return MaptypesArrayGlobal;
7781}
7782
7784 InsertPointTy AllocaIP,
7785 unsigned NumOperands,
7786 struct MapperAllocas &MapperAllocas) {
7787 if (!updateToLocation(Loc))
7788 return;
7789
7790 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7791 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7792 Builder.restoreIP(AllocaIP);
7793 AllocaInst *ArgsBase = Builder.CreateAlloca(
7794 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7795 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7796 ".offload_ptrs");
7797 AllocaInst *ArgSizes = Builder.CreateAlloca(
7798 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7799 Builder.restoreIP(Loc.IP);
7800 MapperAllocas.ArgsBase = ArgsBase;
7801 MapperAllocas.Args = Args;
7802 MapperAllocas.ArgSizes = ArgSizes;
7803}
7804
7806 Function *MapperFunc, Value *SrcLocInfo,
7807 Value *MaptypesArg, Value *MapnamesArg,
7809 int64_t DeviceID, unsigned NumOperands) {
7810 if (!updateToLocation(Loc))
7811 return;
7812
7813 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7814 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7815 Value *ArgsBaseGEP =
7817 {Builder.getInt32(0), Builder.getInt32(0)});
7818 Value *ArgsGEP =
7820 {Builder.getInt32(0), Builder.getInt32(0)});
7821 Value *ArgSizesGEP =
7823 {Builder.getInt32(0), Builder.getInt32(0)});
7824 Value *NullPtr =
7825 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7826 Builder.CreateCall(MapperFunc,
7827 {SrcLocInfo, Builder.getInt64(DeviceID),
7828 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7829 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7830}
7831
7833 TargetDataRTArgs &RTArgs,
7834 TargetDataInfo &Info,
7835 bool ForEndCall) {
7836 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7837 "expected region end call to runtime only when end call is separate");
7838 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7839 auto VoidPtrTy = UnqualPtrTy;
7840 auto VoidPtrPtrTy = UnqualPtrTy;
7841 auto Int64Ty = Type::getInt64Ty(M.getContext());
7842 auto Int64PtrTy = UnqualPtrTy;
7843
7844 if (!Info.NumberOfPtrs) {
7845 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7846 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7847 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7848 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7849 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7850 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7851 return;
7852 }
7853
7855 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7856 Info.RTArgs.BasePointersArray,
7857 /*Idx0=*/0, /*Idx1=*/0);
7859 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7860 /*Idx0=*/0,
7861 /*Idx1=*/0);
7863 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7864 /*Idx0=*/0, /*Idx1=*/0);
7866 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7867 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7868 : Info.RTArgs.MapTypesArray,
7869 /*Idx0=*/0,
7870 /*Idx1=*/0);
7871
7872 // Only emit the mapper information arrays if debug information is
7873 // requested.
7874 if (!Info.EmitDebug)
7875 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7876 else
7878 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7879 /*Idx0=*/0,
7880 /*Idx1=*/0);
7881 // If there is no user-defined mapper, set the mapper array to nullptr to
7882 // avoid an unnecessary data privatization
7883 if (!Info.HasMapper)
7884 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7885 else
7886 RTArgs.MappersArray =
7887 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7888}
7889
7891 InsertPointTy CodeGenIP,
7892 MapInfosTy &CombinedInfo,
7893 TargetDataInfo &Info) {
7895 CombinedInfo.NonContigInfo;
7896
7897 // Build an array of struct descriptor_dim and then assign it to
7898 // offload_args.
7899 //
7900 // struct descriptor_dim {
7901 // uint64_t offset;
7902 // uint64_t count;
7903 // uint64_t stride
7904 // };
7905 Type *Int64Ty = Builder.getInt64Ty();
7907 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7908 "struct.descriptor_dim");
7909
7910 enum { OffsetFD = 0, CountFD, StrideFD };
7911 // We need two index variable here since the size of "Dims" is the same as
7912 // the size of Components, however, the size of offset, count, and stride is
7913 // equal to the size of base declaration that is non-contiguous.
7914 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7915 // Skip emitting ir if dimension size is 1 since it cannot be
7916 // non-contiguous.
7917 if (NonContigInfo.Dims[I] == 1)
7918 continue;
7919 Builder.restoreIP(AllocaIP);
7920 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7921 AllocaInst *DimsAddr =
7922 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7923 Builder.restoreIP(CodeGenIP);
7924 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7925 unsigned RevIdx = EE - II - 1;
7926 Value *DimsLVal = Builder.CreateInBoundsGEP(
7927 DimsAddr->getAllocatedType(), DimsAddr,
7928 {Builder.getInt64(0), Builder.getInt64(II)});
7929 // Offset
7930 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7932 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7933 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7934 // Count
7935 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7937 NonContigInfo.Counts[L][RevIdx], CountLVal,
7938 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7939 // Stride
7940 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7942 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7943 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7944 }
7945 // args[I] = &dims
7946 Builder.restoreIP(CodeGenIP);
7948 DimsAddr, Builder.getPtrTy());
7950 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7951 Info.RTArgs.PointersArray, 0, I);
7954 ++L;
7955 }
7956}
7957
7958void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7959 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7960 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7961 BasicBlock *ExitBB, bool IsInit) {
7962 StringRef Prefix = IsInit ? ".init" : ".del";
7963
7964 // Evaluate if this is an array section.
7966 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
7967 Value *IsArray =
7968 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7969 Value *DeleteBit = Builder.CreateAnd(
7970 MapType,
7972 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7973 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7974 Value *DeleteCond;
7975 Value *Cond;
7976 if (IsInit) {
7977 // base != begin?
7978 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7979 // IsPtrAndObj?
7980 Value *PtrAndObjBit = Builder.CreateAnd(
7981 MapType,
7983 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7984 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7985 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7986 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7987 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7988 DeleteCond = Builder.CreateIsNull(
7989 DeleteBit,
7990 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7991 } else {
7992 Cond = IsArray;
7993 DeleteCond = Builder.CreateIsNotNull(
7994 DeleteBit,
7995 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7996 }
7997 Cond = Builder.CreateAnd(Cond, DeleteCond);
7998 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7999
8000 emitBlock(BodyBB, MapperFn);
8001 // Get the array size by multiplying element size and element number (i.e., \p
8002 // Size).
8003 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8004 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8005 // memory allocation/deletion purpose only.
8006 Value *MapTypeArg = Builder.CreateAnd(
8007 MapType,
8009 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8010 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8011 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8012 MapTypeArg = Builder.CreateOr(
8013 MapTypeArg,
8015 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8016 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8017
8018 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8019 // data structure.
8020 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8021 ArraySize, MapTypeArg, MapName};
8023 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8024 OffloadingArgs);
8025}
8026
// Emit a user-defined mapper function. The generated function takes six
// arguments read below via getArg(0..5): the runtime mapper handle, the base
// and begin pointers of the mapped array section, its element count, the
// inherited map type, and the map name. The body loops over every element,
// pushes one runtime component per entry produced by \p GenMapInfoCB, and
// applies OpenMP 5.0 map-type decay (see the table further down).
    function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
                              llvm::Value *BeginArg)>
        GenMapInfoCB,
    Type *ElemTy, StringRef FuncName,
    function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
  // Pointer parameters of the mapper signature (handle, base, begin, name).
  SmallVector<Type *> Params;
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getPtrTy());

  auto *FnTy =
      FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);

  SmallString<64> TyStr;
  raw_svector_ostream Out(TyStr);
  Function *MapperFn =
  // The mapper must stay a distinct, non-throwing entry point the runtime
  // can call; all parameters are runtime-provided and never undef.
  MapperFn->addFnAttr(Attribute::NoInline);
  MapperFn->addFnAttr(Attribute::NoUnwind);
  MapperFn->addParamAttr(0, Attribute::NoUndef);
  MapperFn->addParamAttr(1, Attribute::NoUndef);
  MapperFn->addParamAttr(2, Attribute::NoUndef);
  MapperFn->addParamAttr(3, Attribute::NoUndef);
  MapperFn->addParamAttr(4, Attribute::NoUndef);
  MapperFn->addParamAttr(5, Attribute::NoUndef);

  // Start the mapper function code generation.
  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
  auto SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(EntryBB);

  Value *MapperHandle = MapperFn->getArg(0);
  Value *BaseIn = MapperFn->getArg(1);
  Value *BeginIn = MapperFn->getArg(2);
  Value *Size = MapperFn->getArg(3);
  Value *MapType = MapperFn->getArg(4);
  Value *MapName = MapperFn->getArg(5);

  // Compute the starting and end addresses of array elements.
  // Prepare common arguments for array initiation and deletion.
  // Convert the size in bytes into the number of array elements.
  TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
  Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
  Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);

  // Emit array initiation if this is an array section and \p MapType indicates
  // that memory allocation is required.
  BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
                             MapType, MapName, ElementSize, HeadBB,
                             /*IsInit=*/true);

  // Emit a for loop to iterate through SizeArg of elements and map all of them.

  // Emit the loop header block.
  emitBlock(HeadBB, MapperFn);
  BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
  BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
  // Evaluate whether the initial condition is satisfied.
  Value *IsEmpty =
      Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
  Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);

  // Emit the loop body block.
  emitBlock(BodyBB, MapperFn);
  BasicBlock *LastBB = BodyBB;
  PHINode *PtrPHI =
      Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
  PtrPHI->addIncoming(PtrBegin, HeadBB);

  // Get map clause information. Fill up the arrays with all mapped variables.
  MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);

  // Call the runtime API __tgt_mapper_num_components to get the number of
  // pre-existing components.
  Value *OffloadingArgs[] = {MapperHandle};
  Value *PreviousSize = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
      OffloadingArgs);
  // NOTE(review): ShiftedPreviousSize appears to encode the pre-existing
  // component count into the MEMBER_OF bits of each map type — confirm the
  // shift amount against getFlagMemberOffset().
  Value *ShiftedPreviousSize =

  // Fill up the runtime mapper handle for all components.
  for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
    Value *CurBaseArg =
        Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
    Value *CurBeginArg =
    Value *CurSizeArg = Info.Sizes[I];
    // Names are optional; a null name is passed when none were recorded.
    Value *CurNameArg = Info.Names.size()
                            ? Info.Names[I]

    // Extract the MEMBER_OF field from the map type.
    Value *OriMapType = Builder.getInt64(
        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            Info.Types[I]));
    Value *MemberMapType =
        Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);

    // Combine the map type inherited from user-defined mapper with that
    // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
    // bits of the \a MapType, which is the input argument of the mapper
    // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
    // bits of MemberMapType.
    // [OpenMP 5.0], 1.2.6. map-type decay.
    //        | alloc |  to   | from  | tofrom | release | delete
    // ----------------------------------------------------------
    // alloc  | alloc | alloc | alloc | alloc  | release | delete
    // to     | alloc |  to   | alloc |   to   | release | delete
    // from   | alloc | alloc | from  |  from  | release | delete
    // tofrom | alloc |  to   | from  | tofrom | release | delete
    Value *LeftToFrom = Builder.CreateAnd(
        MapType,
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO |
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
    BasicBlock *AllocElseBB =
        BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
    BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
    BasicBlock *ToElseBB =
        BasicBlock::Create(M.getContext(), "omp.type.to.else");
    BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
    BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
    Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
    Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
    // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
    emitBlock(AllocBB, MapperFn);
    Value *AllocMapType = Builder.CreateAnd(
        MemberMapType,
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO |
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateBr(EndBB);
    emitBlock(AllocElseBB, MapperFn);
    Value *IsTo = Builder.CreateICmpEQ(
        LeftToFrom,
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
    Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
    // In case of to, clear OMP_MAP_FROM.
    emitBlock(ToBB, MapperFn);
    Value *ToMapType = Builder.CreateAnd(
        MemberMapType,
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateBr(EndBB);
    emitBlock(ToElseBB, MapperFn);
    Value *IsFrom = Builder.CreateICmpEQ(
        LeftToFrom,
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateCondBr(IsFrom, FromBB, EndBB);
    // In case of from, clear OMP_MAP_TO.
    emitBlock(FromBB, MapperFn);
    Value *FromMapType = Builder.CreateAnd(
        MemberMapType,
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
    // In case of tofrom, do nothing.
    emitBlock(EndBB, MapperFn);
    LastBB = EndBB;
    // Merge the four decayed map types; ToElseBB reaches here directly only
    // in the tofrom case, where MemberMapType is kept unchanged.
    PHINode *CurMapType =
        Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
    CurMapType->addIncoming(AllocMapType, AllocBB);
    CurMapType->addIncoming(ToMapType, ToBB);
    CurMapType->addIncoming(FromMapType, FromBB);
    CurMapType->addIncoming(MemberMapType, ToElseBB);

    Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
                               CurSizeArg, CurMapType, CurNameArg};
    Function *ChildMapperFn = nullptr;
    if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
      // Call the corresponding mapper function.
      Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
    } else {
      // Call the runtime API __tgt_push_mapper_component to fill up the runtime
      // data structure.
          getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
          OffloadingArgs);
    }
  }

  // Update the pointer to point to the next element that needs to be mapped,
  // and check whether we have mapped all elements.
  Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
                                              "omp.arraymap.next");
  PtrPHI->addIncoming(PtrNext, LastBB);
  Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
  BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
  Builder.CreateCondBr(IsDone, ExitBB, BodyBB);

  emitBlock(ExitBB, MapperFn);
  // Emit array deletion if this is an array section and \p MapType indicates
  // that deletion is required.
  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
                             MapType, MapName, ElementSize, DoneBB,
                             /*IsInit=*/false);

  // Emit the function exit block.
  emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);

  // Restore the caller's insertion point; the mapper was built out-of-line.
  Builder.restoreIP(SavedIP);
  return MapperFn;
}
8246
// Emit the per-region offloading arrays (.offload_baseptrs, .offload_ptrs,
// .offload_sizes, .offload_mappers, plus the constant offload_maptypes /
// offload_mapnames globals) described by \p CombinedInfo, and record them in
// \p Info.RTArgs for the subsequent runtime call.
    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
    TargetDataInfo &Info, bool IsNonContiguous,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB,
    function_ref<Value *(unsigned int)> CustomMapperCB) {

  // Reset the array information.
  Info.clearArrayInfo();
  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();

  if (Info.NumberOfPtrs == 0)
    return;

  Builder.restoreIP(AllocaIP);
  // Detect if we have any capture size requiring runtime evaluation of the
  // size so that a constant array could be eventually used.
  ArrayType *PointerArrayType =
      ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);

  Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");

  Info.RTArgs.PointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
  AllocaInst *MappersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
  Info.RTArgs.MappersArray = MappersArray;

  // If we don't have any VLA types or other types that require runtime
  // evaluation, we can use a constant array for the map sizes, otherwise we
  // need to fill up the arrays as we do for the pointers.
  Type *Int64Ty = Builder.getInt64Ty();
  SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
                                     ConstantInt::get(Int64Ty, 0));
  // RuntimeSizes marks every entry whose size is not a plain constant and
  // therefore has to be stored into the sizes array at runtime.
  SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
  for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
    if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
      if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
        // Non-contiguous entries carry the dimension count in the size slot.
        if (IsNonContiguous &&
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                CombinedInfo.Types[I] &
                OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
          ConstSizes[I] =
              ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
        else
          ConstSizes[I] = CI;
        continue;
      }
    }
    RuntimeSizes.set(I);
  }

  if (RuntimeSizes.all()) {
    // All sizes are dynamic: only a runtime-filled alloca is needed.
    ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
    Info.RTArgs.SizesArray = Builder.CreateAlloca(
        SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
    Builder.restoreIP(CodeGenIP);
  } else {
    auto *SizesArrayInit = ConstantArray::get(
        ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
    std::string Name = createPlatformSpecificName({"offload_sizes"});
    auto *SizesArrayGbl =
        new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
                           GlobalValue::PrivateLinkage, SizesArrayInit, Name);
    SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);

    if (!RuntimeSizes.any()) {
      // Every size is constant: the global alone suffices.
      Info.RTArgs.SizesArray = SizesArrayGbl;
    } else {
      // Mixed constant/runtime sizes: copy the constant global into a buffer
      // so the runtime entries can be patched in afterwards.
      unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
      Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
      ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
          SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
      Buffer->setAlignment(OffloadSizeAlign);
      Builder.restoreIP(CodeGenIP);
          Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
          SizesArrayGbl, OffloadSizeAlign,
              IndexSize,
              Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));

      Info.RTArgs.SizesArray = Buffer;
    }
    Builder.restoreIP(CodeGenIP);
  }

  // The map types are always constant so we don't need to generate code to
  // fill arrays. Instead, we create an array constant.
  for (auto mapFlag : CombinedInfo.Types)
    Mapping.push_back(
        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            mapFlag));
  std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
  auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
  Info.RTArgs.MapTypesArray = MapTypesArrayGbl;

  // The information types are only built if provided.
  if (!CombinedInfo.Names.empty()) {
    std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
    auto *MapNamesArrayGbl =
        createOffloadMapnames(CombinedInfo.Names, MapnamesName);
    Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
    Info.EmitDebug = true;
  } else {
    Info.RTArgs.MapNamesArray =
    Info.EmitDebug = false;
  }

  // If there's a present map type modifier, it must not be applied to the end
  // of a region, so generate a separate map type array in that case.
  if (Info.separateBeginEndCalls()) {
    bool EndMapTypesDiffer = false;
    for (uint64_t &Type : Mapping) {
      if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                     OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
        Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
        EndMapTypesDiffer = true;
      }
    }
    if (EndMapTypesDiffer) {
      MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
      Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
    }
  }

  // Store every operand's base pointer, pointer, (runtime) size and custom
  // mapper into its slot of the corresponding array.
  PointerType *PtrTy = Builder.getPtrTy();
  for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
    Value *BPVal = CombinedInfo.BasePointers[I];
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
        0, I);
    Builder.CreateAlignedStore(BPVal, BP,

    if (Info.requiresDevicePointerInfo()) {
      if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
        // use_device_ptr: hand the caller a dedicated alloca that will hold
        // the translated device pointer.
        CodeGenIP = Builder.saveIP();
        Builder.restoreIP(AllocaIP);
        Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
        Builder.restoreIP(CodeGenIP);
        if (DeviceAddrCB)
          DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
      } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
        // use_device_addr: the base-pointer slot itself is handed back.
        Info.DevicePtrInfoMap[BPVal] = {BP, BP};
        if (DeviceAddrCB)
          DeviceAddrCB(I, BP);
      }
    }

    Value *PVal = CombinedInfo.Pointers[I];
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
        I);
    // TODO: Check alignment correct.

    if (RuntimeSizes.test(I)) {
          ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
          /*Idx0=*/0,
          /*Idx1=*/I);
                                     Int64Ty,
                                     /*isSigned=*/true),
                                 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
    }
    // Fill up the mapper array.
    unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
    Value *MFunc = ConstantPointerNull::get(PtrTy);
    if (CustomMapperCB)
      if (Value *CustomMFunc = CustomMapperCB(I))
        MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
        MappersArray->getAllocatedType(), MappersArray,
        {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
        MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
  }

  if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
      Info.NumberOfPtrs == 0)
    return;
  emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
}
8437
8440
8441 if (!CurBB || CurBB->getTerminator()) {
8442 // If there is no insert point or the previous block is already
8443 // terminated, don't touch it.
8444 } else {
8445 // Otherwise, create a fall-through branch.
8447 }
8448
8450}
8451
8453 bool IsFinished) {
8455
8456 // Fall out of the current block (if necessary).
8457 emitBranch(BB);
8458
8459 if (IsFinished && BB->use_empty()) {
8460 BB->eraseFromParent();
8461 return;
8462 }
8463
8464 // Place the block after the current block, if possible, or else at
8465 // the end of the function.
8466 if (CurBB && CurBB->getParent())
8467 CurFn->insert(std::next(CurBB->getIterator()), BB);
8468 else
8469 CurFn->insert(CurFn->end(), BB);
8471}
8472
8474 BodyGenCallbackTy ElseGen,
8475 InsertPointTy AllocaIP) {
8476 // If the condition constant folds and can be elided, try to avoid emitting
8477 // the condition and the dead arm of the if/else.
8478 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
8479 auto CondConstant = CI->getSExtValue();
8480 if (CondConstant)
8481 return ThenGen(AllocaIP, Builder.saveIP());
8482
8483 return ElseGen(AllocaIP, Builder.saveIP());
8484 }
8485
8487
8488 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8489 // emit the conditional branch.
8490 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8491 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8492 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8493 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8494 // Emit the 'then' code.
8495 emitBlock(ThenBlock, CurFn);
8496 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8497 return Err;
8498 emitBranch(ContBlock);
8499 // Emit the 'else' code if present.
8500 // There is no need to emit line number for unconditional branch.
8501 emitBlock(ElseBlock, CurFn);
8502 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8503 return Err;
8504 // There is no need to emit line number for unconditional branch.
8505 emitBranch(ContBlock);
8506 // Emit the continuation block for code after the if.
8507 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8508 return Error::success();
8509}
8510
8511bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8512 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8515 "Unexpected Atomic Ordering.");
8516
8517 bool Flush = false;
8519
8520 switch (AK) {
8521 case Read:
8524 FlushAO = AtomicOrdering::Acquire;
8525 Flush = true;
8526 }
8527 break;
8528 case Write:
8529 case Compare:
8530 case Update:
8533 FlushAO = AtomicOrdering::Release;
8534 Flush = true;
8535 }
8536 break;
8537 case Capture:
8538 switch (AO) {
8540 FlushAO = AtomicOrdering::Acquire;
8541 Flush = true;
8542 break;
8544 FlushAO = AtomicOrdering::Release;
8545 Flush = true;
8546 break;
8550 Flush = true;
8551 break;
8552 default:
8553 // do nothing - leave silently.
8554 break;
8555 }
8556 }
8557
8558 if (Flush) {
8559 // Currently Flush RT call still doesn't take memory_ordering, so for when
8560 // that happens, this tries to do the resolution of which atomic ordering
8561 // to use with but issue the flush call
8562 // TODO: pass `FlushAO` after memory ordering support is added
8563 (void)FlushAO;
8564 emitFlush(Loc);
8565 }
8566
8567 // for AO == AtomicOrdering::Monotonic and all other case combinations
8568 // do nothing
8569 return Flush;
8570}
8571
// Emit an OpenMP `atomic read`: atomically load from X and store the loaded
// value into V. Presumably OpenMPIRBuilder::createAtomicRead — the opening
// lines of the signature are elided in this listing (TODO confirm).
 8575 AtomicOrdering AO) {
 8576 if (!updateToLocation(Loc))
 8577 return Loc.IP;
 8578
 8579 assert(X.Var->getType()->isPointerTy() &&
 8580 "OMP Atomic expects a pointer to target memory");
 8581 Type *XElemTy = X.ElemTy;
 8582 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
 8583 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
 8584 "OMP atomic read expected a scalar type");
 8585
 8586 Value *XRead = nullptr;
 8587
 // Integer element types can be loaded atomically as-is.
 8588 if (XElemTy->isIntegerTy()) {
 8589 LoadInst *XLD =
 8590 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
 8591 XLD->setAtomic(AO);
 8592 XRead = cast<Value>(XLD);
 8593 } else if (XElemTy->isStructTy()) {
 8594 // FIXME: Add checks to ensure __atomic_load is emitted iff the
 8595 // target does not support `atomicrmw` of the size of the struct
 // Struct reads go through the __atomic_load libcall; the plain load
 // below exists only so its module/alignment can seed AtomicInfo and is
 // erased once the libcall has been emitted.
 8596 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
 8597 OldVal->setAtomic(AO);
 8598 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
 8599 unsigned LoadSize =
 8600 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
 8601 OpenMPIRBuilder::AtomicInfo atomicInfo(
 8602 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
 8603 OldVal->getAlign(), true /* UseLibcall */, X.Var);
 8604 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
 8605 XRead = AtomicLoadRes.first;
 8606 OldVal->eraseFromParent();
 8607 } else {
 8608 // We need to perform atomic op as integer
 // Float/pointer types: load atomically through a same-sized integer type
 // (IntCastTy initializer line is elided in this listing) and cast back.
 8609 IntegerType *IntCastTy =
 8611 LoadInst *XLoad =
 8612 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
 8613 XLoad->setAtomic(AO);
 8614 if (XElemTy->isFloatingPointTy()) {
 8615 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
 8616 } else {
 8617 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
 8618 }
 8619 }
 // Emit a flush if the memory ordering requires one, then publish the
 // read value into the capture variable V.
 8620 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
 8621 if (XRead->getType() != V.Var->getType())
 8622 XRead = emitImplicitCast(Builder, XRead, V.Var);
 8623 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
 8624 return Builder.saveIP();
 8625}
8626
// Emit an OpenMP `atomic write`: atomically store Expr into X. Presumably
// OpenMPIRBuilder::createAtomicWrite — the first signature line is elided
// in this listing (TODO confirm).
 8629 AtomicOpValue &X, Value *Expr,
 8630 AtomicOrdering AO) {
 8631 if (!updateToLocation(Loc))
 8632 return Loc.IP;
 8633
 8634 assert(X.Var->getType()->isPointerTy() &&
 8635 "OMP Atomic expects a pointer to target memory");
 8636 Type *XElemTy = X.ElemTy;
 8637 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
 8638 XElemTy->isPointerTy()) &&
 8639 "OMP atomic write expected a scalar type");
 8640
 // Integers can be stored atomically directly; everything else is
 // bitcast to a same-sized integer first (the IntCastTy initializer
 // line is elided in this listing).
 8641 if (XElemTy->isIntegerTy()) {
 8642 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
 8643 XSt->setAtomic(AO);
 8644 } else {
 8645 // We need to bitcast and perform atomic op as integers
 8646 IntegerType *IntCastTy =
 8648 Value *ExprCast =
 8649 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
 8650 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
 8651 XSt->setAtomic(AO);
 8652 }
 8653
 // Emit a flush if the memory ordering requires one.
 8654 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
 8655 return Builder.saveIP();
 8656}
8657
// Emit an OpenMP `atomic update`: X is updated in place via RMWOp/UpdateOp.
// Thin wrapper over emitAtomicUpdate that validates its inputs (debug-only)
// and emits the trailing flush. The return-type/name line of the signature
// is elided in this listing.
 8659 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
 8660 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
 8661 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
 8662 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
 8663 if (!updateToLocation(Loc))
 8664 return Loc.IP;
 8665
 // Input validation is compiled only into debug builds.
 8666 LLVM_DEBUG({
 8667 Type *XTy = X.Var->getType();
 8668 assert(XTy->isPointerTy() &&
 8669 "OMP Atomic expects a pointer to target memory");
 8670 Type *XElemTy = X.ElemTy;
 8671 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
 8672 XElemTy->isPointerTy()) &&
 8673 "OMP atomic update expected a scalar type");
 8674 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
 8675 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
 8676 "OpenMP atomic does not support LT or GT operations");
 8677 });
 8678
 // Delegate to emitAtomicUpdate (the declaration line of AtomicResult is
 // elided in this listing); propagate any error from the callback.
 8680 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
 8681 X.IsVolatile, IsXBinopExpr);
 8682 if (!AtomicResult)
 8683 return AtomicResult.takeError();
 8684 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
 8685 return Builder.saveIP();
 8686}
8687
 8688// FIXME: Duplicating AtomicExpand
// Recompute the non-atomic result of an atomicrmw binop from the loaded old
// value (Src1) and the operand (Src2). Used to materialize the "new value"
// for capture clauses. Several case labels between the visible ones are
// elided in this listing (e.g. the label for the CreateNeg/And branch and
// the unsupported FP/min-max cases funneling into llvm_unreachable).
 8689Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
 8690 AtomicRMWInst::BinOp RMWOp) {
 8691 switch (RMWOp) {
 8692 case AtomicRMWInst::Add:
 8693 return Builder.CreateAdd(Src1, Src2);
 8694 case AtomicRMWInst::Sub:
 8695 return Builder.CreateSub(Src1, Src2);
 8696 case AtomicRMWInst::And:
 8697 return Builder.CreateAnd(Src1, Src2);
 // Nand: not(and) expressed as neg-of-and here (case label elided above).
 8699 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
 8700 case AtomicRMWInst::Or:
 8701 return Builder.CreateOr(Src1, Src2);
 8702 case AtomicRMWInst::Xor:
 8703 return Builder.CreateXor(Src1, Src2);
 8708 case AtomicRMWInst::Max:
 8709 case AtomicRMWInst::Min:
 8718 llvm_unreachable("Unsupported atomic update operation");
 8719 }
 8720 llvm_unreachable("Unsupported atomic update operation");
 8721}
8722
// Core helper for atomic update/capture: atomically applies RMWOp (or the
// generic UpdateOp callback) to *X and returns {old value, new value}.
// Three strategies:
//   1. a single `atomicrmw` when the op and element type allow it,
//   2. a __atomic_* libcall-based compare-exchange loop for struct types,
//   3. a cmpxchg retry loop through a same-sized integer type otherwise.
// Note: several lines (AtomicOrdering parameter, IntCastTy initializers,
// Failure-ordering setup, cmpxchg call) are elided in this listing.
 8723Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
 8724 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
 8726 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
 8727 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
 8728 // or a complex datatype.
 // Decide whether a single atomicrmw instruction suffices.
 8729 bool emitRMWOp = false;
 8730 switch (RMWOp) {
 8731 case AtomicRMWInst::Add:
 8732 case AtomicRMWInst::And:
 8734 case AtomicRMWInst::Or:
 8735 case AtomicRMWInst::Xor:
 8737 emitRMWOp = XElemTy;
 8738 break;
 8739 case AtomicRMWInst::Sub:
 // Sub is only commutative-friendly when X is the LHS of the binop.
 8740 emitRMWOp = (IsXBinopExpr && XElemTy);
 8741 break;
 8742 default:
 8743 emitRMWOp = false;
 8744 }
 // atomicrmw fast path is restricted to integer element types here.
 8745 emitRMWOp &= XElemTy->isIntegerTy();
 8746
 8747 std::pair<Value *, Value *> Res;
 8748 if (emitRMWOp) {
 8749 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
 8750 // not needed except in case of postfix captures. Generate anyway for
 8751 // consistency with the else part. Will be removed with any DCE pass.
 8752 // AtomicRMWInst::Xchg does not have a corresponding instruction.
 8753 if (RMWOp == AtomicRMWInst::Xchg)
 8754 Res.second = Res.first;
 8755 else
 8756 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
 8757 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
 8758 XElemTy->isStructTy()) {
 // Struct path: libcall-based load + compare-exchange retry loop.
 8759 LoadInst *OldVal =
 8760 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
 8761 OldVal->setAtomic(AO);
 8762 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
 8763 unsigned LoadSize =
 8764 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
 8765
 8766 OpenMPIRBuilder::AtomicInfo atomicInfo(
 8767 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
 8768 OldVal->getAlign(), true /* UseLibcall */, X);
 8769 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
 // Split the current block into CurBB -> ContBB (retry loop) -> ExitBB.
 // (CurBB declaration line is elided in this listing.)
 8771 Instruction *CurBBTI = CurBB->getTerminator();
 8772 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
 8773 BasicBlock *ExitBB =
 8774 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
 8775 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
 8776 X->getName() + ".atomic.cont");
 8777 ContBB->getTerminator()->eraseFromParent();
 8778 Builder.restoreIP(AllocaIP);
 8779 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
 8780 NewAtomicAddr->setName(X->getName() + "x.new.val");
 8781 Builder.SetInsertPoint(ContBB);
 // PHI carries the previously-observed value into each retry iteration.
 8782 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
 8783 PHI->addIncoming(AtomicLoadRes.first, CurBB);
 8784 Value *OldExprVal = PHI;
 8785 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
 8786 if (!CBResult)
 8787 return CBResult.takeError();
 8788 Value *Upd = *CBResult;
 8789 Builder.CreateStore(Upd, NewAtomicAddr);
 // Failure-ordering computation lines are elided in this listing.
 8792 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
 8793 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
 8794 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
 8795 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
 8796 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
 8797 OldVal->eraseFromParent();
 8798 Res.first = OldExprVal;
 8799 Res.second = Upd;
 8800
 // If we created the unreachable terminator above, drop it and continue
 // emitting into ExitBB; otherwise resume before the real terminator.
 8801 if (UnreachableInst *ExitTI =
 8802 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
 8803 CurBBTI->eraseFromParent();
 8804 Builder.SetInsertPoint(ExitBB);
 8805 } else {
 8806 Builder.SetInsertPoint(ExitTI);
 8807 }
 8808 } else {
 // Generic path: cmpxchg retry loop via a same-sized integer type
 // (IntCastTy initializer line is elided in this listing).
 8810 IntegerType *IntCastTy =
 8811 LoadInst *OldVal =
 8812 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
 8813 OldVal->setAtomic(AO);
 8814 // CurBB
 8815 // | /---\
 8816 // ContBB |
 8817 // | \---/
 8818 // ExitBB
 8820 Instruction *CurBBTI = CurBB->getTerminator();
 8821 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
 8822 BasicBlock *ExitBB =
 8823 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
 8824 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
 8825 X->getName() + ".atomic.cont");
 8826 ContBB->getTerminator()->eraseFromParent();
 8827 Builder.restoreIP(AllocaIP);
 8828 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
 8829 NewAtomicAddr->setName(X->getName() + "x.new.val");
 8830 Builder.SetInsertPoint(ContBB);
 8831 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
 8832 PHI->addIncoming(OldVal, CurBB);
 8833 bool IsIntTy = XElemTy->isIntegerTy();
 8834 Value *OldExprVal = PHI;
 // Cast the raw integer back to the user-visible element type before
 // handing it to the update callback.
 8835 if (!IsIntTy) {
 8836 if (XElemTy->isFloatingPointTy()) {
 8837 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
 8838 X->getName() + ".atomic.fltCast");
 8839 } else {
 8840 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
 8841 X->getName() + ".atomic.ptrCast");
 8842 }
 8843 }
 8844
 8845 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
 8846 if (!CBResult)
 8847 return CBResult.takeError();
 8848 Value *Upd = *CBResult;
 8849 Builder.CreateStore(Upd, NewAtomicAddr);
 8850 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
 // The CreateAtomicCmpXchg call's opening lines are elided in this
 // listing; Result is its returned {previous value, success flag} pair.
 8854 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
 8855 Result->setVolatile(VolatileX);
 8856 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
 8857 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
 8858 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
 8859 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
 8860
 8861 Res.first = OldExprVal;
 8862 Res.second = Upd;
 8863
 8864 // set Insertion point in exit block
 8865 if (UnreachableInst *ExitTI =
 8866 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
 8867 CurBBTI->eraseFromParent();
 8868 Builder.SetInsertPoint(ExitBB);
 8869 } else {
 8870 Builder.SetInsertPoint(ExitTI);
 8871 }
 8872 }
 8873
 8874 return Res;
 8875}
8876
// Emit an OpenMP `atomic capture`: update X and store either the old value
// (postfix, v = x++) or the new value (prefix, v = ++x) into V. The
// return-type/name line of the signature is elided in this listing.
 8878 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
 8879 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
 8881 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
 8882 if (!updateToLocation(Loc))
 8883 return Loc.IP;
 8884
 // Input validation is compiled only into debug builds.
 8885 LLVM_DEBUG({
 8886 Type *XTy = X.Var->getType();
 8887 assert(XTy->isPointerTy() &&
 8888 "OMP Atomic expects a pointer to target memory");
 8889 Type *XElemTy = X.ElemTy;
 8890 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
 8891 XElemTy->isPointerTy()) &&
 8892 "OMP atomic capture expected a scalar type");
 8893 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
 8894 "OpenMP atomic does not support LT or GT operations");
 8895 });
 8896
 8897 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
 8898 // 'x' is simply atomically rewritten with 'expr'.
 8899 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
 // AtomicResult's declaration line is elided in this listing; it holds
 // the {old value, new value} pair from emitAtomicUpdate.
 8901 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
 8902 X.IsVolatile, IsXBinopExpr);
 8903 if (!AtomicResult)
 8904 return AtomicResult.takeError();
 // Postfix captures take the pre-update value, prefix captures the result.
 8905 Value *CapturedVal =
 8906 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
 8907 if (CapturedVal->getType() != V.Var->getType())
 8908 CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var);
 8909 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
 8910
 8911 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
 8912 return Builder.saveIP();
 8913}
8914
// Convenience overload of createAtomicCompare: forwards to the full overload
// with a default failure ordering (its computation line is elided in this
// listing). The leading signature lines are also elided.
 8918 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
 8919 bool IsFailOnly) {
 8920
 8922 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
 8923 IsPostfixUpdate, IsFailOnly, Failure);
 8924}
8925
// Emit an OpenMP `atomic compare`: either a conditional store (==, lowered
// to cmpxchg) or a min/max update (lowered to atomicrmw). Optionally
// captures the old/selected value into V and the comparison result into R.
// The leading signature lines are elided in this listing.
 8929 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
 8930 bool IsFailOnly, AtomicOrdering Failure) {
 8931
 8932 if (!updateToLocation(Loc))
 8933 return Loc.IP;
 8934
 8935 assert(X.Var->getType()->isPointerTy() &&
 8936 "OMP atomic expects a pointer to target memory");
 8937 // compare capture
 8938 if (V.Var) {
 8939 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
 8940 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
 8941 }
 8942
 8943 bool IsInteger = E->getType()->isIntegerTy();
 8944
 // Equality comparison: lower to an atomic compare-exchange.
 8945 if (Op == OMPAtomicCompareOp::EQ) {
 8946 AtomicCmpXchgInst *Result = nullptr;
 // Non-integer operands are bitcast to a same-sized integer since
 // cmpxchg requires integer (or pointer) operands.
 8947 if (!IsInteger) {
 8948 IntegerType *IntCastTy =
 8949 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
 8950 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
 8951 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
 8952 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
 8953 AO, Failure);
 8954 } else {
 8955 Result =
 8956 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
 8957 }
 8958
 // Capture clause: store the appropriate value into V.
 8959 if (V.Var) {
 8960 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
 8961 if (!IsInteger)
 8962 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
 8963 assert(OldValue->getType() == V.ElemTy &&
 8964 "OldValue and V must be of same type");
 8965 if (IsPostfixUpdate) {
 8966 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
 8967 } else {
 8968 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
 // `fail` clause only: store the old value to V solely on the
 // failure path, using a small diamond of blocks.
 8969 if (IsFailOnly) {
 8970 // CurBB----
 8971 // | |
 8972 // v |
 8973 // ContBB |
 8974 // | |
 8975 // v |
 8976 // ExitBB <-
 8977 //
 8978 // where ContBB only contains the store of old value to 'v'.
 // (CurBB declaration line is elided in this listing.)
 8980 Instruction *CurBBTI = CurBB->getTerminator();
 8981 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
 8982 BasicBlock *ExitBB = CurBB->splitBasicBlock(
 8983 CurBBTI, X.Var->getName() + ".atomic.exit");
 8984 BasicBlock *ContBB = CurBB->splitBasicBlock(
 8985 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
 8986 ContBB->getTerminator()->eraseFromParent();
 8987 CurBB->getTerminator()->eraseFromParent();
 8988
 8989 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
 8990
 8991 Builder.SetInsertPoint(ContBB);
 8992 Builder.CreateStore(OldValue, V.Var);
 8993 Builder.CreateBr(ExitBB);
 8994
 8995 if (UnreachableInst *ExitTI =
 8996 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
 8997 CurBBTI->eraseFromParent();
 8998 Builder.SetInsertPoint(ExitBB);
 8999 } else {
 9000 Builder.SetInsertPoint(ExitTI);
 9001 }
 9002 } else {
 // Regular capture: v = success ? e : old-x, without branching.
 9003 Value *CapturedValue =
 9004 Builder.CreateSelect(SuccessOrFail, E, OldValue);
 9005 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
 9006 }
 9007 }
 9008 }
 9009 // The comparison result has to be stored.
 9010 if (R.Var) {
 9011 assert(R.Var->getType()->isPointerTy() &&
 9012 "r.var must be of pointer type");
 9013 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
 9014
 9015 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
 9016 Value *ResultCast = R.IsSigned
 9017 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
 9018 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy)
 9019 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
 9020 }
 9021 } else {
 9022 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
 9023 "Op should be either max or min at this point");
 9024 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
 9025
 9026 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
 9027 // Let's take max as example.
 9028 // OpenMP form:
 9029 // x = x > expr ? expr : x;
 9030 // LLVM form:
 9031 // *ptr = *ptr > val ? *ptr : val;
 9032 // We need to transform to LLVM form.
 9033 // x = x <= expr ? x : expr;
 // Pick the atomicrmw opcode by signedness/floatness and operand order.
 // (NewOp declaration line and several ternary alternatives are elided
 // in this listing.)
 9035 if (IsXBinopExpr) {
 9036 if (IsInteger) {
 9037 if (X.IsSigned)
 9038 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
 9040 else
 9041 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
 9043 } else {
 9044 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
 9046 }
 9047 } else {
 9048 if (IsInteger) {
 9049 if (X.IsSigned)
 9050 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
 9052 else
 9053 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
 9055 } else {
 9056 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
 9058 }
 9059 }
 9060
 9061 AtomicRMWInst *OldValue =
 9062 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
 9063 if (V.Var) {
 9064 Value *CapturedValue = nullptr;
 9065 if (IsPostfixUpdate) {
 9066 CapturedValue = OldValue;
 9067 } else {
 // Prefix capture: recompute the post-update value non-atomically
 // by comparing the old value against E with the matching predicate.
 // (Several case labels below are elided in this listing.)
 9068 CmpInst::Predicate Pred;
 9069 switch (NewOp) {
 9070 case AtomicRMWInst::Max:
 9071 Pred = CmpInst::ICMP_SGT;
 9072 break;
 9074 Pred = CmpInst::ICMP_UGT;
 9075 break;
 9077 Pred = CmpInst::FCMP_OGT;
 9078 break;
 9079 case AtomicRMWInst::Min:
 9080 Pred = CmpInst::ICMP_SLT;
 9081 break;
 9083 Pred = CmpInst::ICMP_ULT;
 9084 break;
 9086 Pred = CmpInst::FCMP_OLT;
 9087 break;
 9088 default:
 9089 llvm_unreachable("unexpected comparison op");
 9090 }
 9091 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
 9092 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
 9093 }
 9094 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
 9095 }
 9096 }
 9097
 9098 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
 9099
 9100 return Builder.saveIP();
 9101}
9102
// Emit an OpenMP `teams` region: the body is generated via BodyGenCB into
// blocks that are later outlined into a separate function and invoked via
// __kmpc_fork_teams (host only). The leading signature lines are elided in
// this listing.
 9105 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
 9106 Value *NumTeamsUpper, Value *ThreadLimit,
 9107 Value *IfExpr) {
 9108 if (!updateToLocation(Loc))
 9109 return InsertPointTy();
 9110
 9111 uint32_t SrcLocStrSize;
 9112 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 9113 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 9114 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
 9115
 9116 // Outer allocation basicblock is the entry block of the current function.
 // Never emit the region directly into the entry block: split so allocas
 // stay separate from the outlined code.
 9117 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
 9118 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
 9119 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
 9120 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
 9121 }
 9122
 9123 // The current basic block is split into four basic blocks. After outlining,
 9124 // they will be mapped as follows:
 9125 // ```
 9126 // def current_fn() {
 9127 // current_basic_block:
 9128 // br label %teams.exit
 9129 // teams.exit:
 9130 // ; instructions after teams
 9131 // }
 9132 //
 9133 // def outlined_fn() {
 9134 // teams.alloca:
 9135 // br label %teams.body
 9136 // teams.body:
 9137 // ; instructions within teams body
 9138 // }
 9139 // ```
 9140 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
 9141 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
 9142 BasicBlock *AllocaBB =
 9143 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
 9144
 9145 bool SubClausesPresent =
 9146 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
 9147 // Push num_teams
 // Host only: communicate num_teams/thread_limit/if to the runtime via
 // __kmpc_push_num_teams_51 before forking.
 9148 if (!Config.isTargetDevice() && SubClausesPresent) {
 9149 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
 9150 "if lowerbound is non-null, then upperbound must also be non-null "
 9151 "for bounds on num_teams");
 9152
 // A zero upper bound lets the runtime pick the team count.
 9153 if (NumTeamsUpper == nullptr)
 9154 NumTeamsUpper = Builder.getInt32(0);
 9155
 9156 if (NumTeamsLower == nullptr)
 9157 NumTeamsLower = NumTeamsUpper;
 9158
 9159 if (IfExpr) {
 9160 assert(IfExpr->getType()->isIntegerTy() &&
 9161 "argument to if clause must be an integer value");
 9162
 9163 // upper = ifexpr ? upper : 1
 9164 if (IfExpr->getType() != Int1)
 9165 IfExpr = Builder.CreateICmpNE(IfExpr,
 9166 ConstantInt::get(IfExpr->getType(), 0));
 9167 NumTeamsUpper = Builder.CreateSelect(
 9168 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
 9169
 9170 // lower = ifexpr ? lower : 1
 9171 NumTeamsLower = Builder.CreateSelect(
 9172 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
 9173 }
 9174
 9175 if (ThreadLimit == nullptr)
 9176 ThreadLimit = Builder.getInt32(0);
 9177
 // The runtime-call emission line (Builder.CreateCall of
 // __kmpc_push_num_teams_51) is partially elided in this listing.
 9178 Value *ThreadNum = getOrCreateThreadID(Ident);
 9180 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
 9181 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
 9182 }
 9183 // Generate the body of teams.
 9184 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
 9185 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
 9186 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
 9187 return Err;
 9188
 9189 OutlineInfo OI;
 9190 OI.EntryBB = AllocaBB;
 9191 OI.ExitBB = ExitBB;
 9192 OI.OuterAllocaBB = &OuterAllocaBB;
 9193
 9194 // Insert fake values for global tid and bound tid.
 // (ToBeDeleted declaration and the createFakeIntVal call heads are
 // elided in this listing; the fakes are stripped after outlining.)
 9196 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
 9198 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
 9200 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
 9201
 // After outlining (host): replace the stale direct call to the outlined
 // function with a __kmpc_fork_teams runtime call.
 9202 auto HostPostOutlineCB = [this, Ident,
 9203 ToBeDeleted](Function &OutlinedFn) mutable {
 9204 // The stale call instruction will be replaced with a new call instruction
 9205 // for runtime call with the outlined function.
 9206
 9207 assert(OutlinedFn.getNumUses() == 1 &&
 9208 "there must be a single user for the outlined function");
 9209 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
 9210 ToBeDeleted.push_back(StaleCI);
 9211
 9212 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
 9213 "Outlined function must have two or three arguments only");
 9214
 9215 bool HasShared = OutlinedFn.arg_size() == 3;
 9216
 9217 OutlinedFn.getArg(0)->setName("global.tid.ptr");
 9218 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
 9219 if (HasShared)
 9220 OutlinedFn.getArg(2)->setName("data");
 9221
 9222 // Call to the runtime function for teams in the current function.
 9223 assert(StaleCI && "Error while outlining - no CallInst user found for the "
 9224 "outlined function.");
 9225 Builder.SetInsertPoint(StaleCI);
 9226 SmallVector<Value *> Args = {
 9227 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
 9228 if (HasShared)
 9229 Args.push_back(StaleCI->getArgOperand(2));
 // (The Builder.CreateCall head line for __kmpc_fork_teams is elided in
 // this listing.)
 9231 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
 9232 Args);
 // Erase in reverse so uses are removed before their definitions.
 9234 for (Instruction *I : llvm::reverse(ToBeDeleted))
 9235 I->eraseFromParent();
 9236 };
 9237
 9238 if (!Config.isTargetDevice())
 9239 OI.PostOutlineCB = HostPostOutlineCB;
 9240
 9241 addOutlineInfo(std::move(OI));
 9242
 9243 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
 9244
 9245 return Builder.saveIP();
 9246}
9247
// Build a private constant global array holding the map-name constants in
// `Names`, used for offloading debug information. The leading signature
// lines (name and Names parameter) are elided in this listing.
 9250 std::string VarName) {
 9251 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
 9253 Names.size()),
 9254 Names);
 9255 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
 9256 M, MapNamesArrayInit->getType(),
 9257 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
 9258 VarName);
 9259 return MapNamesArrayGlobal;
 9260}
9261
 9262// Create all simple and struct types exposed by the runtime and remember
 9263// the llvm::PointerTypes of them for easy access later.
// The member fields being initialized (VarName, VarName##Ty, VarName##Ptr,
// ...) are declared via the same OMPKinds.def X-macro expansion; this
// function must stay in sync with that .def file.
 9264void OpenMPIRBuilder::initializeTypes(Module &M) {
 9265 LLVMContext &Ctx = M.getContext();
 9266 StructType *T;
 9267#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
 9268#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
 9269 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
 9270 VarName##PtrTy = PointerType::getUnqual(Ctx);
 9271#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
 9272 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
 9273 VarName##Ptr = PointerType::getUnqual(Ctx);
 9274#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
 9275 T = StructType::getTypeByName(Ctx, StructName); \
 9276 if (!T) \
 9277 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
 9278 VarName = T; \
 9279 VarName##Ptr = PointerType::getUnqual(Ctx);
 9280#include "llvm/Frontend/OpenMP/OMPKinds.def"
 9281}
9282
// Collect all basic blocks reachable from EntryBB (stopping at ExitBB) into
// BlockVector via a DFS worklist. The leading signature lines and the
// Worklist/BlockSet declarations are elided in this listing.
 9285 SmallVectorImpl<BasicBlock *> &BlockVector) {
 // Seeding ExitBB into the visited set keeps the walk from exploring
 // past the region exit.
 9287 BlockSet.insert(EntryBB);
 9288 BlockSet.insert(ExitBB);
 9289
 9290 Worklist.push_back(EntryBB);
 9291 while (!Worklist.empty()) {
 9292 BasicBlock *BB = Worklist.pop_back_val();
 9293 BlockVector.push_back(BB);
 9294 for (BasicBlock *SuccBB : successors(BB))
 9295 if (BlockSet.insert(SuccBB).second)
 9296 Worklist.push_back(SuccBB);
 9297 }
 9298}
9299
// Register one offloading entry. On non-GPU hosts this emits a descriptor
// through the offloading infrastructure (the emission call lines are elided
// in this listing); on GPU targets it only annotates kernel functions.
// The leading signature lines (ID/Addr parameters) are elided.
 9301 uint64_t Size, int32_t Flags,
 9303 StringRef Name) {
 9304 if (!Config.isGPU()) {
 9307 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
 9308 return;
 9309 }
 9310 // TODO: Add support for global variables on the device after declare target
 9311 // support.
 9312 Function *Fn = dyn_cast<Function>(Addr);
 9313 if (!Fn)
 9314 return;
 9315
 9316 // Add a function attribute for the kernel.
 9317 Fn->addFnAttr("kernel");
 // AMDGPU kernels additionally promise uniform work-group sizes.
 9318 if (T.isAMDGCN())
 9319 Fn->addFnAttr("uniform-work-group-size", "true");
 9320 Fn->addFnAttr(Attribute::MustProgress);
 9321}
9322
 9323// We only generate metadata for function that contain target regions.
// Emit the `omp_offload.info` named metadata plus the offloading entries for
// every recorded target region and declare-target global variable. ErrorFn
// is invoked for entries that are missing an address/ID. The function's
// signature lines and several declarations (C, OrderedEntries element type)
// are elided in this listing.

 // Early-out when the offload-info manager recorded nothing (the condition
 // line is elided in this listing).
 9327 // If there are no entries, we don't need to do anything.
 9329 return;
 // OrderedEntries keeps entries in creation order, indexed by E.getOrder().
 9334 16>
 9335 OrderedEntries(OffloadInfoManager.size());
 9336
 9337 // Auxiliary methods to create metadata values and strings.
 9338 auto &&GetMDInt = [this](unsigned V) {
 9339 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
 9340 };
 9341
 9342 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
 9343
 9344 // Create the offloading info metadata node.
 9345 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
 9346 auto &&TargetRegionMetadataEmitter =
 9347 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
 9348 const TargetRegionEntryInfo &EntryInfo,
 9350 // Generate metadata for target regions. Each entry of this metadata
 9351 // contains:
 9352 // - Entry 0 -> Kind of this type of metadata (0).
 9353 // - Entry 1 -> Device ID of the file where the entry was identified.
 9354 // - Entry 2 -> File ID of the file where the entry was identified.
 9355 // - Entry 3 -> Mangled name of the function where the entry was
 9356 // identified.
 9357 // - Entry 4 -> Line in the file where the entry was identified.
 9358 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
 9359 // - Entry 6 -> Order the entry was created.
 9360 // The first element of the metadata node is the kind.
 9361 Metadata *Ops[] = {
 9362 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
 9363 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
 9364 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
 9365 GetMDInt(E.getOrder())};
 9366
 9367 // Save this entry in the right position of the ordered entries array.
 9368 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
 9369
 9370 // Add metadata to the named metadata node.
 9371 MD->addOperand(MDNode::get(C, Ops));
 9372 };
 9373
 9374 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
 9375
 9376 // Create function that emits metadata for each device global variable entry;
 9377 auto &&DeviceGlobalVarMetadataEmitter =
 9378 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
 9379 StringRef MangledName,
 9381 // Generate metadata for global variables. Each entry of this metadata
 9382 // contains:
 9383 // - Entry 0 -> Kind of this type of metadata (1).
 9384 // - Entry 1 -> Mangled name of the variable.
 9385 // - Entry 2 -> Declare target kind.
 9386 // - Entry 3 -> Order the entry was created.
 9387 // The first element of the metadata node is the kind.
 9388 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
 9389 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
 9390
 9391 // Save this entry in the right position of the ordered entries array.
 // Global variables have no source position; use a zeroed EntryInfo.
 9392 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
 9393 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
 9394
 9395 // Add metadata to the named metadata node.
 9396 MD->addOperand(MDNode::get(C, Ops));
 9397 };
 9398
 // (The actOnDeviceGlobalVarEntriesInfo call head line is elided in this
 // listing.)
 9400 DeviceGlobalVarMetadataEmitter);
 9401
 // Second pass: walk entries in creation order and emit actual offload
 // entries, reporting diagnostics for incomplete ones.
 9402 for (const auto &E : OrderedEntries) {
 9403 assert(E.first && "All ordered entries must exist!");
 9404 if (const auto *CE =
 9405 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
 9406 E.first)) {
 9407 if (!CE->getID() || !CE->getAddress()) {
 9408 // Do not blame the entry if the parent function is not emitted.
 9409 TargetRegionEntryInfo EntryInfo = E.second;
 9410 StringRef FnName = EntryInfo.ParentName;
 9411 if (!M.getNamedValue(FnName))
 9412 continue;
 9413 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
 9414 continue;
 9415 }
 9416 createOffloadEntry(CE->getID(), CE->getAddress(),
 9417 /*Size=*/0, CE->getFlags(),
 9419 } else if (const auto *CE = dyn_cast<
 9421 E.first)) {
 // Decode the declare-target flags (some lines here are elided in this
 // listing, including the Flags initialization and case labels).
 9424 CE->getFlags());
 9425 switch (Flags) {
 9429 continue;
 9430 if (!CE->getAddress()) {
 9431 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
 9432 continue;
 9433 }
 9434 // The variable has no definition - no need to add the entry.
 9435 if (CE->getVarSize() == 0)
 9436 continue;
 9437 break;
 9439 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
 9440 (!Config.isTargetDevice() && CE->getAddress())) &&
 9441 "Declaret target link address is set.");
 9442 if (Config.isTargetDevice())
 9443 continue;
 9444 if (!CE->getAddress()) {
 9446 continue;
 9447 }
 9448 break;
 9449 default:
 9450 break;
 9451 }
 9452
 9453 // Hidden or internal symbols on the device are not externally visible.
 9454 // We should not attempt to register them by creating an offloading
 9455 // entry. Indirect variables are handled separately on the device.
 9456 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
 9457 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
 9459 continue;
 9460
 9461 // Indirect globals need to use a special name that doesn't match the name
 9462 // of the associated host global.
 // (The condition line selecting the indirect case is elided in this
 // listing.)
 9464 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
 9465 Flags, CE->getLinkage(), CE->getVarName());
 9466 else
 9467 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
 9468 Flags, CE->getLinkage());
 9469
 9470 } else {
 9471 llvm_unreachable("Unsupported entry kind.");
 9472 }
 9473 }
 9474
 9475 // Emit requires directive globals to a special entry so the runtime can
 9476 // register them when the device image is loaded.
 9477 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
 9478 // entries should be redesigned to better suit this use-case.
 // (The emission call lines for the ".requires" entry are elided in this
 // listing.)
 9483 ".requires", /*Size=*/0,
 9486}
9487
// Compose the mangled name of a target-region kernel into `Name`:
// <prefix><DeviceID hex>_<FileID hex>_<ParentName>_l<Line>[_<Count>].
// The function's return-type/name line is elided in this listing.
 9489 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
 9490 unsigned FileID, unsigned Line, unsigned Count) {
 // (The raw_svector_ostream OS declaration line is elided in this listing.)
 9492 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
 9493 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
 // The count suffix is only appended for the second and later regions at
 // the same source location.
 9494 if (Count)
 9495 OS << "_" << Count;
 9496}
9497
// Member wrapper: look up the running count for this source position and
// forward to the static name mangler above. The enclosing signature lines
// are elided in this listing.
 9500 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
 9502 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
 9503 EntryInfo.Line, NewCount);
 9504}
9505
// Build a TargetRegionEntryInfo keyed by the unique file ID (device/inode)
// of the source file, obtained through the CallBack's (path, line) tuple.
// Fatal-errors out if the filesystem cannot provide a unique ID. The
// leading signature lines and the `sys::fs::UniqueID ID` declaration are
// elided in this listing.
 9508 StringRef ParentName) {
 9510 auto FileIDInfo = CallBack();
 9511 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
 9512 report_fatal_error(("Unable to get unique ID for file, during "
 9513 "getTargetEntryUniqueInfo, error message: " +
 9514 EC.message())
 9515 .c_str());
 9516 }
 9517
 9518 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
 9519 std::get<1>(FileIDInfo));
 9520}
9521
// Count the number of trailing zero bits before the MEMBER_OF field in the
// offload mapping flags, i.e. the shift amount for member positions. The
// function's signature line is elided in this listing.
 9523 unsigned Offset = 0;
 // Scan upward until the first set bit of OMP_MAP_MEMBER_OF is found
 // (the flag constant named in the elided line 9526).
 9524 for (uint64_t Remain =
 9525 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
 9527 !(Remain & 1); Remain = Remain >> 1)
 9528 Offset++;
 9529 return Offset;
 9530}
9531
// Encode a 1-based member position into the MEMBER_OF bit-field of the
// mapping flags by shifting it into place. The function's signature lines
// are elided in this listing.
 9534 // Rotate by getFlagMemberOffset() bits.
 9535 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
 9536 << getFlagMemberOffset());
 9537}
9538
// Replace the MEMBER_OF placeholder in `Flags` with the real MemberOfFlag.
// The leading signature lines (and the Flags parameter declaration) are
// elided in this listing.
 9541 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
 9542 // If the entry is PTR_AND_OBJ but has not been marked with the special
 9543 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
 9544 // marked as MEMBER_OF.
 // (The PTR_AND_OBJ / placeholder comparison operands on lines 9546/9548-49
 // are partially elided in this listing.)
 9545 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
 9547 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
 9550 return;
 9551
 9552 // Reset the placeholder value to prepare the flag for the assignment of the
 9553 // proper MEMBER_OF value.
 9554 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
 9555 Flags |= MemberOfFlag;
 9556}
9557
// For a `declare target ... link` (or device-file to/enter) variable, create
// or fetch the "_decl_tgt_ref_ptr" indirection pointer that the offloading
// runtime patches to point at the device copy. Returns nullptr for the
// OpenMP-SIMD-only mode or when no ref pointer is needed. The leading
// signature lines (capture/device clause parameters) are elided in this
// listing.
 9561 bool IsDeclaration, bool IsExternallyVisible,
 9562 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
 9563 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
 9564 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
 9565 std::function<Constant *()> GlobalInitializer,
 9566 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
 9567 // TODO: convert this to utilise the IRBuilder Config rather than
 9568 // a passed down argument.
 9569 if (OpenMPSIMD)
 9570 return nullptr;
 // The clause predicate lines (link-clause, or to/enter on a target
 // device) are partially elided in this listing.
 9574 CaptureClause ==
 9577 SmallString<64> PtrName;
 9578 {
 9579 raw_svector_ostream OS(PtrName);
 9580 OS << MangledName;
 // Non-visible symbols get a per-file suffix to avoid collisions.
 9581 if (!IsExternallyVisible)
 9582 OS << format("_%x", EntryInfo.FileID);
 9583 OS << "_decl_tgt_ref_ptr";
 9584 }
 9585
 9586 Value *Ptr = M.getNamedValue(PtrName);
 9587
 // First request: create the ref pointer and register the variable.
 9588 if (!Ptr) {
 9589 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
 9590 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
 9591
 9592 auto *GV = cast<GlobalVariable>(Ptr);
 9593 GV->setLinkage(GlobalValue::WeakAnyLinkage);
 9594
 // Host side: initialize the ref pointer to the host variable (or via
 // the supplied initializer callback).
 9595 if (!Config.isTargetDevice()) {
 9596 if (GlobalInitializer)
 9597 GV->setInitializer(GlobalInitializer());
 9598 else
 9599 GV->setInitializer(GlobalValue);
 9600 }
 // (The registerTargetGlobalVariable call head line is elided in this
 // listing.)
 9603 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
 9604 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
 9605 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
 9606 }
 9607
 9608 return cast<Constant>(Ptr);
 9609 }
 9610
 9611 return nullptr;
 9612}
9613
// OpenMPIRBuilder::registerTargetGlobalVariable (function head lost in
// extraction) — records a declare-target global variable in the offload
// entries table, computing its name, size, linkage and entry flags, and on
// the device side creating an internal "ref" alias when needed.
9617 bool IsDeclaration, bool IsExternallyVisible,
9618 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9619 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9620 std::vector<Triple> TargetTriple,
9621 std::function<Constant *()> GlobalInitializer,
9622 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9623 Constant *Addr) {
// NOTE(review): the first half of this early-exit condition (original line
// 9624) was lost in extraction — presumably it tests OpenMPSIMD; a host
// compile with no target triples has nothing to register either way.
9625 (TargetTriple.empty() && !Config.isTargetDevice()))
9626 return;
9627
// NOTE(review): original lines 9628 and 9631 (the Flags declaration and the
// Linkage declaration, by the uses below) were lost in extraction.
9629 StringRef VarName;
9630 int64_t VarSize;
9632
// NOTE(review): this branch's condition (original lines ~9633-9637) was lost
// in extraction — presumably it selects 'to'/'enter' captures as opposed to
// the 'link' handling in the else branch; confirm against the original.
9634 CaptureClause ==
9638 VarName = MangledName;
9639 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9640
// A definition has a concrete size; a mere declaration is recorded with 0.
// NOTE(review): the divideCeil operands (original line 9643) were lost —
// presumably the type's size in bits divided by 8.
9641 if (!IsDeclaration)
9642 VarSize = divideCeil(
9644 else
9645 VarSize = 0;
9646 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9647
9648 // This is a workaround carried over from Clang which prevents undesired
9649 // optimisation of internal variables.
9650 if (Config.isTargetDevice() &&
9651 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9652 // Do not create a "ref-variable" if the original is not also available
9653 // on the host.
// NOTE(review): the condition guarding this return (original line 9654) was
// lost in extraction; confirm against the original source.
9655 return;
9656
9657 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9658
// Emit a constant internal global holding the variable's address so the
// optimizer cannot discard or rename the underlying symbol.
9659 if (!M.getNamedValue(RefName)) {
9660 Constant *AddrRef =
9661 getOrCreateInternalVariable(Addr->getType(), RefName);
9662 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9663 GvAddrRef->setConstant(true);
9664 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9665 GvAddrRef->setInitializer(Addr);
9666 GeneratedRefs.push_back(GvAddrRef);
9667 }
9668 }
// 'link' (and equivalent) handling: the entry records the indirection
// pointer rather than the variable itself.
9669 } else {
// NOTE(review): the Flags assignments on original lines ~9670-9673 were lost
// in extraction; confirm against the original source.
9672 else
9674
9675 if (Config.isTargetDevice()) {
9676 VarName = (Addr) ? Addr->getName() : "";
9677 Addr = nullptr;
9678 } else {
// NOTE(review): the call target (original line 9679) was lost — from the
// arguments it is presumably Addr = getAddrOfDeclareTargetVar(...).
9680 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9681 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9682 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9683 VarName = (Addr) ? Addr->getName() : "";
9684 }
// Link entries always record a pointer-sized object.
9685 VarSize = M.getDataLayout().getPointerSize();
9687 }
9688
// NOTE(review): the call receiving these trailing arguments (original line
// 9689) was lost — presumably
// OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
9690 Flags, Linkage);
9691}
9692
9693/// Loads all the offload entries information from the host IR
9694/// metadata.
// OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) — function head
// (original line 9695) lost in extraction.
9696 // If we are in target mode, load the metadata from the host IR. This code has
9697 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9698
// NOTE(review): original line 9699 (fetching the named metadata node into
// MD, presumably "omp_offload.info") was lost in extraction.
9700 if (!MD)
9701 return;
9702
9703 for (MDNode *MN : MD->operands()) {
// Helper: read operand Idx of the tuple as an integer constant.
9704 auto &&GetMDInt = [MN](unsigned Idx) {
9705 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9706 return cast<ConstantInt>(V->getValue())->getZExtValue();
9707 };
9708
// Helper: read operand Idx of the tuple as a string.
9709 auto &&GetMDString = [MN](unsigned Idx) {
9710 auto *V = cast<MDString>(MN->getOperand(Idx));
9711 return V->getString();
9712 };
9713
// Operand 0 encodes the entry kind.
9714 switch (GetMDInt(0)) {
9715 default:
9716 llvm_unreachable("Unexpected metadata!");
9717 break;
// NOTE(review): the case labels (original lines ~9718-9719) and the
// registration call (~9725, presumably initializeTargetRegionEntryInfo)
// were lost in extraction; confirm against the original source.
9720 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9721 /*DeviceID=*/GetMDInt(1),
9722 /*FileID=*/GetMDInt(2),
9723 /*Line=*/GetMDInt(4),
9724 /*Count=*/GetMDInt(5));
9726 /*Order=*/GetMDInt(6));
9727 break;
9728 }
// NOTE(review): the device-global-var case label and call head (original
// lines ~9729-9731/9733, presumably initializeDeviceGlobalVarEntryInfo)
// were lost in extraction; confirm against the original source.
9732 /*MangledName=*/GetMDString(1),
9734 /*Flags=*/GetMDInt(2)),
9735 /*Order=*/GetMDInt(3));
9736 break;
9737 }
9738 }
9739}
9740
// OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) —
// function head (original line 9741) lost in extraction. Reads the host IR
// bitcode file at HostFilePath into a temporary context and forwards to the
// Module overload; any I/O or parse failure is fatal.
9742 if (HostFilePath.empty())
9743 return;
9744
9745 auto Buf = MemoryBuffer::getFile(HostFilePath);
9746 if (std::error_code Err = Buf.getError()) {
9747 report_fatal_error(("error opening host file from host file path inside of "
9748 "OpenMPIRBuilder: " +
9749 Err.message())
9750 .c_str());
9751 }
9752
9753 LLVMContext Ctx;
// NOTE(review): the declaration receiving the parsed module (original line
// 9754, presumably "auto M = expectedToErrorOrAndEmitErrors(") was lost in
// extraction; confirm against the original source.
9755 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9756 if (std::error_code Err = M.getError()) {
// NOTE(review): the report_fatal_error( call head (original line 9757) was
// lost in extraction.
9758 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9759 .c_str())
9760 }
9761
9762 loadOffloadInfoMetadata(*M.get());
9763}
9764
9765//===----------------------------------------------------------------------===//
9766// OffloadEntriesInfoManager
9767//===----------------------------------------------------------------------===//
9768
// OffloadEntriesInfoManager::empty — function head (original line 9769)
// lost in extraction. True iff neither target-region nor device-global-var
// entries have been recorded.
9770 return OffloadEntriesTargetRegion.empty() &&
9771 OffloadEntriesDeviceGlobalVar.empty();
9772}
9773
9774unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9775 const TargetRegionEntryInfo &EntryInfo) const {
9776 auto It = OffloadEntriesTargetRegionCount.find(
9777 getTargetRegionEntryCountKey(EntryInfo));
9778 if (It == OffloadEntriesTargetRegionCount.end())
9779 return 0;
9780 return It->second;
9781}
9782
9783void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9784 const TargetRegionEntryInfo &EntryInfo) {
9785 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9786 EntryInfo.Count + 1;
9787}
9788
9789/// Initialize target region entry.
// OffloadEntriesInfoManager::initializeTargetRegionEntryInfo — function
// head (original line 9790) lost in extraction. Creates a placeholder entry
// (no address/ID yet) at the given emission order.
9791 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9792 OffloadEntriesTargetRegion[EntryInfo] =
9793 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9794 OMPTargetRegionEntryTargetRegion);
9795 ++OffloadingEntriesNum;
9796}
9797
// OffloadEntriesInfoManager::registerTargetRegionEntryInfo — function head
// (original lines ~9798-9800, with the Addr/ID/Flags parameters) lost in
// extraction. Registers a concrete target-region entry, assigning the next
// per-location count to EntryInfo first.
9801 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9802
9803 // Update the EntryInfo with the next available count for this location.
9804 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9805
9806 // If we are emitting code for a target, the entry is already initialized,
9807 // only has to be registered.
9808 if (OMPBuilder->Config.isTargetDevice()) {
9809 // This could happen if the device compilation is invoked standalone.
9810 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9811 return;
9812 }
9813 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9814 Entry.setAddress(Addr);
9815 Entry.setID(ID);
9816 Entry.setFlags(Flags);
9817 } else {
// NOTE(review): the first half of this condition (original line 9818) was
// lost in extraction; by the comment style it presumably tests a special
// dummy-entry flag before tolerating re-registration. Confirm upstream.
9819 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9820 return;
9821 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9822 "Target region entry already registered!");
9823 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9824 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9825 ++OffloadingEntriesNum;
9826 }
// Either way, the per-location counter advances so later regions at the
// same file/line get distinct Count values.
9827 incrementTargetRegionEntryInfoCount(EntryInfo);
9828}
9829
// OffloadEntriesInfoManager::hasTargetRegionEntryInfo — function head
// (original line 9830) lost in extraction. Returns true iff an entry exists
// for this location; unless IgnoreAddressId is set, an entry that already
// carries an address or ID counts as "not available" (false).
9831 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9832
9833 // Update the EntryInfo with the next available count for this location.
9834 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9835
9836 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9837 if (It == OffloadEntriesTargetRegion.end()) {
9838 return false;
9839 }
9840 // Fail if this entry is already registered.
9841 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9842 return false;
9843 return true;
9844}
9845
// OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo — function head
// (original line 9846) lost in extraction. Invokes \p Action on every
// recorded target-region entry.
9847 const OffloadTargetRegionEntryInfoActTy &Action) {
9848 // Scan all target region entries and perform the provided action.
9849 for (const auto &It : OffloadEntriesTargetRegion) {
9850 Action(It.first, It.second);
9851 }
9852}
9853
// OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo — function
// head (original line 9854) lost in extraction. Creates a placeholder
// device-global-variable entry keyed by name.
9855 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9856 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9857 ++OffloadingEntriesNum;
9858}
9859
// OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo — function
// head (original line 9860) and the Flags/Linkage parameter line (9862)
// lost in extraction. Registers a device global variable entry, filling in
// size/linkage/address; on the device an unknown name is ignored (can
// happen in standalone device compiles).
9861 StringRef VarName, Constant *Addr, int64_t VarSize,
9863 if (OMPBuilder->Config.isTargetDevice()) {
9864 // This could happen if the device compilation is invoked standalone.
9865 if (!hasDeviceGlobalVarEntryInfo(VarName))
9866 return;
9867 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
// An entry that already has an address only needs its size/linkage patched
// if they were previously unknown (recorded as 0).
9868 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9869 if (Entry.getVarSize() == 0) {
9870 Entry.setVarSize(VarSize);
9871 Entry.setLinkage(Linkage);
9872 }
9873 return;
9874 }
9875 Entry.setVarSize(VarSize);
9876 Entry.setLinkage(Linkage);
9877 Entry.setAddress(Addr);
9878 } else {
9879 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9880 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9881 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9882 "Entry not initialized!");
9883 if (Entry.getVarSize() == 0) {
9884 Entry.setVarSize(VarSize);
9885 Entry.setLinkage(Linkage);
9886 }
9887 return;
9888 }
// NOTE(review): the condition choosing between the two try_emplace calls
// (original line 9889) was lost in extraction — presumably it tests whether
// the entry kind requires recording the variable name; confirm upstream.
9890 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9891 Addr, VarSize, Flags, Linkage,
9892 VarName.str());
9893 else
9894 OffloadEntriesDeviceGlobalVar.try_emplace(
9895 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9896 ++OffloadingEntriesNum;
9897 }
9898}
9899
// OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo — function
// head (original lines ~9900-9901) lost in extraction. Invokes the
// caller-provided Action on every device-global-variable entry.
9902 // Scan all target region entries and perform the provided action.
9903 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9904 Action(E.getKey(), E.getValue());
9905}
9906
9907//===----------------------------------------------------------------------===//
9908// CanonicalLoopInfo
9909//===----------------------------------------------------------------------===//
9910
// Appends this loop's six fixed control blocks (preheader, header, cond,
// latch, exit, after) to the output vector.
9911 void CanonicalLoopInfo::collectControlBlocks(
// NOTE(review): the parameter line (original line 9912) was lost in
// extraction; from the body it must be a SmallVectorImpl<BasicBlock *> &BBs.
9913 // We only count those BBs as control block for which we do not need to
9914 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9915 // flow. For consistency, this also means we do not add the Body block, which
9916 // is just the entry to the body code.
9917 BBs.reserve(BBs.size() + 6);
9918 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9919}
9920
// CanonicalLoopInfo::getPreheader — function head (original line 9921) lost
// in extraction. The preheader is the unique predecessor of the loop header
// that is not the latch; a valid canonical loop always has exactly one.
9922 assert(isValid() && "Requires a valid canonical loop");
9923 for (BasicBlock *Pred : predecessors(Header)) {
// Any non-latch predecessor of the header must be the preheader.
9924 if (Pred != Latch)
9925 return Pred;
9926 }
9927 llvm_unreachable("Missing preheader");
9928}
9929
9930void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9931 assert(isValid() && "Requires a valid canonical loop");
9932
9933 Instruction *CmpI = &getCond()->front();
9934 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9935 CmpI->setOperand(1, TripCount);
9936
9937#ifndef NDEBUG
9938 assertOK();
9939#endif
9940}
9941
9942void CanonicalLoopInfo::mapIndVar(
9943 llvm::function_ref<Value *(Instruction *)> Updater) {
9944 assert(isValid() && "Requires a valid canonical loop");
9945
9946 Instruction *OldIV = getIndVar();
9947
9948 // Record all uses excluding those introduced by the updater. Uses by the
9949 // CanonicalLoopInfo itself to keep track of the number of iterations are
9950 // excluded.
9951 SmallVector<Use *> ReplacableUses;
9952 for (Use &U : OldIV->uses()) {
9953 auto *User = dyn_cast<Instruction>(U.getUser());
9954 if (!User)
9955 continue;
9956 if (User->getParent() == getCond())
9957 continue;
9958 if (User->getParent() == getLatch())
9959 continue;
9960 ReplacableUses.push_back(&U);
9961 }
9962
9963 // Run the updater that may introduce new uses
9964 Value *NewIV = Updater(OldIV);
9965
9966 // Replace the old uses with the value returned by the updater.
9967 for (Use *U : ReplacableUses)
9968 U->set(NewIV);
9969
9970#ifndef NDEBUG
9971 assertOK();
9972#endif
9973}
9974
9976#ifndef NDEBUG
9977 // No constraints if this object currently does not describe a loop.
9978 if (!isValid())
9979 return;
9980
9981 BasicBlock *Preheader = getPreheader();
9982 BasicBlock *Body = getBody();
9983 BasicBlock *After = getAfter();
9984
9985 // Verify standard control-flow we use for OpenMP loops.
9986 assert(Preheader);
9987 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9988 "Preheader must terminate with unconditional branch");
9989 assert(Preheader->getSingleSuccessor() == Header &&
9990 "Preheader must jump to header");
9991
9992 assert(Header);
9993 assert(isa<BranchInst>(Header->getTerminator()) &&
9994 "Header must terminate with unconditional branch");
9995 assert(Header->getSingleSuccessor() == Cond &&
9996 "Header must jump to exiting block");
9997
9998 assert(Cond);
9999 assert(Cond->getSinglePredecessor() == Header &&
10000 "Exiting block only reachable from header");
10001
10002 assert(isa<BranchInst>(Cond->getTerminator()) &&
10003 "Exiting block must terminate with conditional branch");
10004 assert(size(successors(Cond)) == 2 &&
10005 "Exiting block must have two successors");
10006 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10007 "Exiting block's first successor jump to the body");
10008 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10009 "Exiting block's second successor must exit the loop");
10010
10011 assert(Body);
10012 assert(Body->getSinglePredecessor() == Cond &&
10013 "Body only reachable from exiting block");
10014 assert(!isa<PHINode>(Body->front()));
10015
10016 assert(Latch);
10017 assert(isa<BranchInst>(Latch->getTerminator()) &&
10018 "Latch must terminate with unconditional branch");
10019 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10020 // TODO: To support simple redirecting of the end of the body code that has
10021 // multiple; introduce another auxiliary basic block like preheader and after.
10022 assert(Latch->getSinglePredecessor() != nullptr);
10023 assert(!isa<PHINode>(Latch->front()));
10024
10025 assert(Exit);
10026 assert(isa<BranchInst>(Exit->getTerminator()) &&
10027 "Exit block must terminate with unconditional branch");
10028 assert(Exit->getSingleSuccessor() == After &&
10029 "Exit block must jump to after block");
10030
10031 assert(After);
10032 assert(After->getSinglePredecessor() == Exit &&
10033 "After block only reachable from exit block");
10034 assert(After->empty() || !isa<PHINode>(After->front()));
10035
10036 Instruction *IndVar = getIndVar();
10037 assert(IndVar && "Canonical induction variable not found?");
10038 assert(isa<IntegerType>(IndVar->getType()) &&
10039 "Induction variable must be an integer");
10040 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10041 "Induction variable must be a PHI in the loop header");
10042 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10043 assert(
10044 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10045 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10046
10047 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10048 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10049 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10050 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10051 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10052 ->isOne());
10053
10054 Value *TripCount = getTripCount();
10055 assert(TripCount && "Loop trip count not found?");
10056 assert(IndVar->getType() == TripCount->getType() &&
10057 "Trip count and induction variable must have the same type");
10058
10059 auto *CmpI = cast<CmpInst>(&Cond->front());
10060 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10061 "Exit condition must be a signed less-than comparison");
10062 assert(CmpI->getOperand(0) == IndVar &&
10063 "Exit condition must compare the induction variable");
10064 assert(CmpI->getOperand(1) == TripCount &&
10065 "Exit condition must compare with the trip count");
10066#endif
10067}
10068
// CanonicalLoopInfo::invalidate — function head (original line 10069) lost
// in extraction. Clears the control-block pointers so that isValid() —
// which the other members assert on — reports false from now on.
10070 Header = nullptr;
10071 Cond = nullptr;
10072 Latch = nullptr;
10073 Exit = nullptr;
10074}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static llvm::Value * emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead, llvm::Value *V)
Emit an implicit cast to convert XRead to type of variable V.
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={}, bool HasNoWait=false)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
unsigned unsigned DefaultVal
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:245
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:63
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:124
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:99
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:117
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:104
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:128
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:95
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:471
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition: Argument.h:49
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
Class to represent array types.
Definition: DerivedTypes.h:395
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:600
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:937
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:922
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:396
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:684
iterator end()
Definition: BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:461
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:437
reverse_iterator rbegin()
Definition: BasicBlock.h:477
bool empty() const
Definition: BasicBlock.h:483
InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:381
const Instruction & front() const
Definition: BasicBlock.h:484
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:213
InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:398
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:599
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:519
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:481
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:489
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:511
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives in, right before MovePos.
Definition: BasicBlock.h:389
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:240
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:644
const Instruction & back() const
Definition: BasicBlock.h:486
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does not have a module.
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:538
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1943
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1341
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1261
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1267
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:532
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2991
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching the ArrayRef passed in.
Definition: Constants.h:709
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2253
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2268
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2333
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:126
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1826
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
A scope for locals.
DISubprogram * getSubprogram() const
Get the subprogram for this scope.
static DILocalScope * cloneScopeForSubprogram(DILocalScope &RootScope, DISubprogram &NewSP, LLVMContext &Ctx, DenseMap< const MDNode *, MDNode * > &Cache)
Traverses the scope chain rooted at RootScope until it hits a Subprogram, recreating the chain with "NewSP" instead.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
Debug location.
DIFile * getFile() const
Subprogram description.
DISPFlags
Debug info subprogram flags.
Type array for a subprogram.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:247
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:486
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:229
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:739
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
static ErrorSuccess success()
Create a success value.
Definition: Error.h:337
Tagged union holding either a T or a Error.
Definition: Error.h:481
Error takeError()
Take ownership of the stored error.
Definition: Error.h:608
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:173
const BasicBlock & getEntryBlock() const
Definition: Function.h:815
bool empty() const
Definition: Function.h:865
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:454
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:359
const Function & getFunction() const
Definition: Function.h:171
iterator begin()
Definition: Function.h:859
arg_iterator arg_begin()
Definition: Function.h:874
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:362
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:669
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:760
size_t arg_size() const
Definition: Function.h:907
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
iterator end()
Definition: Function.h:861
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:281
Argument * getArg(unsigned i) const
Definition: Function.h:892
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1565
LinkageTypes getLinkage() const
Definition: GlobalValue.h:547
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:538
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
void setDSOLocal(bool Local)
Definition: GlobalValue.h:304
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:295
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:255
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:297
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:276
BasicBlock * getBlock() const
Definition: IRBuilder.h:291
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:289
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:292
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1417
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1075
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2286
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1849
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1887
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1781
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2106
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2294
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2051
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1306
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2555
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1265
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1980
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:600
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2045
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2147
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1379
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1882
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2211
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1421
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2274
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1383
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:540
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1733
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:296
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2404
Value * CreateFPCast(Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2246
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1187
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2270
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:164
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1387
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:516
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition: IRBuilder.h:1798
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1459
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2033
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1518
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1134
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1921
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1967
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1811
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1370
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1430
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2588
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1862
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2019
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1540
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2302
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:500
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2282
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:308
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2583
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:583
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1499
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1562
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1447
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:677
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2066
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:44
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2086
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:511
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:426
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original order of the moved instructions.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:508
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
Metadata node.
Definition: Metadata.h:1073
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1557
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1432
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
A tuple of MDNodes.
Definition: Metadata.h:1737
iterator_range< op_iterator > operands()
Definition: Metadata.h:1833
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:246
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:248
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:379
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:381
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:299
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:301
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:290
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:359
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:365
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:371
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
Definition: OMPIRBuilder.h:361
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:435
bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:95
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:191
StringRef separator() const
Definition: OMPIRBuilder.h:177
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:167
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:108
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:150
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:144
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:187
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:476
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:545
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for : X = Expr — Only Scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:522
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:525
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:679
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:695
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:996
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1054
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1064
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:130
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:146
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:946
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
Definition: CallingConv.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:85
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:756
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:647
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:205
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61