//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));
75
77 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
78 cl::desc("Factor for the unroll threshold to account for code "
79 "simplifications still taking place"),
80 cl::init(1.5));
81
#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. whether inserting
/// instructions at position IP1 may change the meaning of IP2 or vice-versa.
/// This is because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}
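
// For example (illustrative): a simd schedule combined with the ordered
// clause degrades to OrderedGuidedChunked / OrderedRuntime above, because the
// runtime provides no ordered variants of the simd schedules.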

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
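
// For example (illustrative): `schedule(static)` without an ordered clause
// yields UnorderedStatic (the implied monotonic flag stays unset), while
// `schedule(dynamic, 4)` yields
// UnorderedDynamicChunked | ModifierNonmonotonic, since only static or
// ordered schedules default to monotonic behavior.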

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale to move those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *
createFakeIntVal(IRBuilderBase &Builder,
                 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                 llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                 const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {

/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };
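
  // For instance (illustrative): on a target whose ABI widens i32 arguments,
  // getExtAttrForI32Param yields SExt or ZExt, so the runtime declaration
  // ends up with e.g. `signext i32` parameters instead of plain `i32`.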

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)               \
  case Enum:                                                                  \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                          \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                        \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)               \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                        \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                         \
  case Enum:                                                                  \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},       \
                             IsVarArg);                                       \
    Fn = M.getFunction(Str);                                                  \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
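
// For example (illustrative): requesting OMPRTL___kmpc_global_thread_num on a
// module that does not yet declare it creates a declaration along the lines of
//   declare i32 @__kmpc_global_thread_num(ptr)
// with attributes attached via addAttributes above.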

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions which are
    // used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that terminator may have
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry block
  // of our target or risk malformed optimisations by later passes. This is
  // only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here; it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in
  // movement of any stores to variables the allocation size depends on, as
  // well as the usual loads, otherwise it'll yield the wrong result after
  // movement) and would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}
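
// For example (illustrative): emitUsed("llvm.compiler.used", ...) with one
// global @G produces
//   @llvm.compiler.used = appending global [1 x ptr] [ptr @G],
//                         section "llvm.metadata"
// which keeps the listed globals alive through optimization.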

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
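
// For example (illustrative): a location in function `foo` at test.c:3:7
// produces the string ";test.c;foo;3;7;;", the ident_t psource format
// expected by the OpenMP runtime.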

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
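
// For example (illustrative): in a non-cancellable region this emits roughly
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident, i32 %gtid)
// where @ident encodes the source location and the barrier kind flags.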

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}
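
// For example (illustrative): every kernel argument is stored into a field of
// the stack-allocated %kernel_args struct, whose address is then passed along:
//   %rc = call i32 @__tgt_target_kernel(ptr %ident, i64 %device_id,
//                                       i32 %num_teams, i32 %num_threads,
//                                       ptr %host_ptr, ptr %kernel_args)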

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply calls the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
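
// The generated control flow looks roughly like (illustrative):
//   %cmp = icmp eq i32 %cancel.flag, 0
//   br i1 %cmp, label %bb.cont, label %bb.cncl
// with the finalization callbacks emitted into the .cncl block before code
// generation resumes in the .cont block.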

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
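
// Net effect (illustrative): the direct call to the outlined function is
// replaced by roughly
//   call void @__kmpc_parallel_51(ptr %ident, i32 %gtid, i32 %if_cond,
//                                 i32 %num_threads, i32 -1, ptr @outlined,
//                                 ptr null, ptr %args, i64 %nargs)
// with the captured values spilled into the %args array beforehand.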

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
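
// Net effect (illustrative): the direct call to the outlined function becomes
// roughly
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr %ident,
//                                                    i32 %num_captured,
//                                                    ptr @outlined, ...)
// with the captured variables forwarded as the variadic tail.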
1492
1493OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1494 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1495 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1496 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1497 omp::ProcBindKind ProcBind, bool IsCancellable) {
1498 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1499
1500 if (!updateToLocation(Loc))
1501 return Loc.IP;
1502
1503 uint32_t SrcLocStrSize;
1504 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1505 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1506 Value *ThreadID = getOrCreateThreadID(Ident);
1507 // If we generate code for the target device, we need to allocate
1508 // struct for aggregate params in the device default alloca address space.
1509 // OpenMP runtime requires that the params of the extracted functions are
1510 // passed as zero address space pointers. This flag ensures that extracted
1511 // function arguments are declared in zero address space
1512 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1513
1514 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1515 // only if we compile for host side.
1516 if (NumThreads && !Config.isTargetDevice()) {
1517 Value *Args[] = {
1518 Ident, ThreadID,
1519 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1520 Builder.CreateCall(
1521 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1522 }
1523
1524 if (ProcBind != OMP_PROC_BIND_default) {
1525 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1526 Value *Args[] = {
1527 Ident, ThreadID,
1528 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1529 Builder.CreateCall(
1530 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1531 }
1532
1533 BasicBlock *InsertBB = Builder.GetInsertBlock();
1534 Function *OuterFn = InsertBB->getParent();
1535
1536 // Save the outer alloca block because the insertion iterator may get
1537 // invalidated and we still need this later.
1538 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1539
1540 // Vector to remember instructions we used only during the modeling but which
1541 // we want to delete at the end.
1542 SmallVector<Instruction *, 4> ToBeDeleted;
1543
1544 // Change the location to the outer alloca insertion point to create and
1545 // initialize the allocas we pass into the parallel region.
1546 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1547 Builder.restoreIP(NewOuter);
1548 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1549 AllocaInst *ZeroAddrAlloca =
1550 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1551 Instruction *TIDAddr = TIDAddrAlloca;
1552 Instruction *ZeroAddr = ZeroAddrAlloca;
1553 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1554 // Add additional casts to enforce pointers in zero address space
1555 TIDAddr = new AddrSpaceCastInst(
1556 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1557 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1558 ToBeDeleted.push_back(TIDAddr);
1559 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1560 PointerType::get(M.getContext(), 0),
1561 "zero.addr.ascast");
1562 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1563 ToBeDeleted.push_back(ZeroAddr);
1564 }
1565
1566 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1567 // associated arguments in the outlined function, so we delete them later.
1568 ToBeDeleted.push_back(TIDAddrAlloca);
1569 ToBeDeleted.push_back(ZeroAddrAlloca);
1570
1571 // Create an artificial insertion point that will also ensure the blocks we
1572 // are about to split are not degenerate.
1573 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1574
1575 BasicBlock *EntryBB = UI->getParent();
1576 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1577 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1578 BasicBlock *PRegPreFiniBB =
1579 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1580 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1581
1582 auto FiniCBWrapper = [&](InsertPointTy IP) {
1583 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1584 // target to the region exit block.
1585 if (IP.getBlock()->end() == IP.getPoint()) {
1586 IRBuilder<>::InsertPointGuard IPG(Builder);
1587 Builder.restoreIP(IP);
1588 Instruction *I = Builder.CreateBr(PRegExitBB);
1589 IP = InsertPointTy(I->getParent(), I->getIterator());
1590 }
1591 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1592 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1593 "Unexpected insertion point for finalization call!");
1594 return FiniCB(IP);
1595 };
1596
1597 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1598
1599 // Generate the privatization allocas in the block that will become the entry
1600 // of the outlined function.
1601 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1602 InsertPointTy InnerAllocaIP = Builder.saveIP();
1603
1604 AllocaInst *PrivTIDAddr =
1605 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1606 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1607
1608 // Add some fake uses for OpenMP provided arguments.
1609 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1610 Instruction *ZeroAddrUse =
1611 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1612 ToBeDeleted.push_back(ZeroAddrUse);
1613
1614 // EntryBB
1615 // |
1616 // V
1617 // PRegionEntryBB <- Privatization allocas are placed here.
1618 // |
1619 // V
1620 // PRegionBodyBB <- BodyGen is invoked here.
1621 // |
1622 // V
1623 // PRegPreFiniBB <- The block we will start finalization from.
1624 // |
1625 // V
1626 // PRegionExitBB <- A common exit to simplify block collection.
1627 //
1628
1629 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1630
1631 // Let the caller create the body.
1632 assert(BodyGenCB && "Expected body generation callback!");
1633 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1634 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1635 return Err;
1636
1637 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1638
1639 OutlineInfo OI;
1640 if (Config.isTargetDevice()) {
1641 // Generate OpenMP target specific runtime call
1642 OI.PostOutlineCB = [=, ToBeDeletedVec =
1643 std::move(ToBeDeleted)](Function &OutlinedFn) {
1644 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1645 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1646 ThreadID, ToBeDeletedVec);
1647 };
1648 } else {
1649 // Generate OpenMP host runtime call
1650 OI.PostOutlineCB = [=, ToBeDeletedVec =
1651 std::move(ToBeDeleted)](Function &OutlinedFn) {
1652 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1653 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1654 };
1655 }
1656
1657 OI.OuterAllocaBB = OuterAllocaBlock;
1658 OI.EntryBB = PRegEntryBB;
1659 OI.ExitBB = PRegExitBB;
1660
1661 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1662 SmallVector<BasicBlock *, 32> Blocks;
1663 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1664
1665 CodeExtractorAnalysisCache CEAC(*OuterFn);
1666 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1667 /* AggregateArgs */ false,
1668 /* BlockFrequencyInfo */ nullptr,
1669 /* BranchProbabilityInfo */ nullptr,
1670 /* AssumptionCache */ nullptr,
1671 /* AllowVarArgs */ true,
1672 /* AllowAlloca */ true,
1673 /* AllocationBlock */ OuterAllocaBlock,
1674 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1675
1676 // Find inputs to, outputs from the code region.
1677 BasicBlock *CommonExit = nullptr;
1678 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1679 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1680
1681 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1682 /*CollectGlobalInputs=*/true);
1683
1684 Inputs.remove_if([&](Value *I) {
1685 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1686 return GV->getValueType() == OpenMPIRBuilder::Ident;
1687
1688 return false;
1689 });
1690
1691 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1692
1693 FunctionCallee TIDRTLFn =
1694 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1695
1696 auto PrivHelper = [&](Value &V) -> Error {
1697 if (&V == TIDAddr || &V == ZeroAddr) {
1698 OI.ExcludeArgsFromAggregate.push_back(&V);
1699 return Error::success();
1700 }
1701
1702 SetVector<Use *> Uses;
1703 for (Use &U : V.uses())
1704 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1705 if (ParallelRegionBlockSet.count(UserI->getParent()))
1706 Uses.insert(&U);
1707
1708 // __kmpc_fork_call expects extra arguments as pointers. If the input
1709 // already has a pointer type, everything is fine. Otherwise, store the
1710 // value onto stack and load it back inside the to-be-outlined region. This
1711 // will ensure only the pointer will be passed to the function.
1712 // FIXME: if there are more than 15 trailing arguments, they must be
1713 // additionally packed in a struct.
1714 Value *Inner = &V;
1715 if (!V.getType()->isPointerTy()) {
1716 IRBuilder<>::InsertPointGuard Guard(Builder);
1717 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1718
1719 Builder.restoreIP(OuterAllocaIP);
1720 Value *Ptr =
1721 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1722
1723 // Store to stack at end of the block that currently branches to the entry
1724 // block of the to-be-outlined region.
1725 Builder.SetInsertPoint(InsertBB,
1726 InsertBB->getTerminator()->getIterator());
1727 Builder.CreateStore(&V, Ptr);
1728
1729 // Load back next to allocations in the to-be-outlined region.
1730 Builder.restoreIP(InnerAllocaIP);
1731 Inner = Builder.CreateLoad(V.getType(), Ptr);
1732 }
1733
1734 Value *ReplacementValue = nullptr;
1735 CallInst *CI = dyn_cast<CallInst>(&V);
1736 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1737 ReplacementValue = PrivTID;
1738 } else {
1739 InsertPointOrErrorTy AfterIP =
1740 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1741 if (!AfterIP)
1742 return AfterIP.takeError();
1743 Builder.restoreIP(*AfterIP);
1744 InnerAllocaIP = {
1745 InnerAllocaIP.getBlock(),
1746 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1747
1748 assert(ReplacementValue &&
1749 "Expected copy/create callback to set replacement value!");
1750 if (ReplacementValue == &V)
1751 return Error::success();
1752 }
1753
1754 for (Use *UPtr : Uses)
1755 UPtr->set(ReplacementValue);
1756
1757 return Error::success();
1758 };
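// Minimal sketch (not part of the original source): a caller that shares
// every value by reference can satisfy the PrivCB contract exercised above
// by simply forwarding the inner value, e.g.
//   auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
//                    Value &Orig, Value &Inner,
//                    Value *&ReplacementValue) -> InsertPointOrErrorTy {
//     ReplacementValue = &Inner; // Share by pointer; no private copy.
//     return CodeGenIP;
//   };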
1759
1760 // Reset the inner alloca insertion as it will be used for loading the values
1761 // wrapped into pointers before passing them into the to-be-outlined region.
1762 // Configure it to insert immediately after the fake use of zero address so
1763 // that they are available in the generated body and so that the
1764 // OpenMP-related values (thread ID and zero address pointers) remain leading
1765 // in the argument list.
1766 InnerAllocaIP = IRBuilder<>::InsertPoint(
1767 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1768
1769 // Reset the outer alloca insertion point to the entry of the relevant block
1770 // in case it was invalidated.
1771 OuterAllocaIP = IRBuilder<>::InsertPoint(
1772 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1773
1774 for (Value *Input : Inputs) {
1775 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1776 if (Error Err = PrivHelper(*Input))
1777 return Err;
1778 }
1779 LLVM_DEBUG({
1780 for (Value *Output : Outputs)
1781 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1782 });
1783 assert(Outputs.empty() &&
1784 "OpenMP outlining should not produce live-out values!");
1785
1786 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1787 LLVM_DEBUG({
1788 for (auto *BB : Blocks)
1789 dbgs() << " PBR: " << BB->getName() << "\n";
1790 });
1791
1792 // Adjust the finalization stack, verify the adjustment, and call the
1793 // finalize function a last time to finalize values between the pre-fini
1794 // block and the exit block if we left the parallel "the normal way".
1795 auto FiniInfo = FinalizationStack.pop_back_val();
1796 (void)FiniInfo;
1797 assert(FiniInfo.DK == OMPD_parallel &&
1798 "Unexpected finalization stack state!");
1799
1800 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1801
1802 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1803 if (Error Err = FiniCB(PreFiniIP))
1804 return Err;
1805
1806 // Register the outlined info.
1807 addOutlineInfo(std::move(OI));
1808
1809 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1810 UI->eraseFromParent();
1811
1812 return AfterIP;
1813}
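// Minimal invocation sketch (hypothetical callback names, not from the
// original source):
//   InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
//       Loc, OuterAllocaIP, BodyGenCB, PrivCB, FiniCB,
//       /*IfCondition=*/nullptr, /*NumThreads=*/nullptr,
//       OMP_PROC_BIND_default, /*IsCancellable=*/false);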
1814
1815void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1816 // Build call void __kmpc_flush(ident_t *loc)
1817 uint32_t SrcLocStrSize;
1818 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1819 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1820
1821 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1822}
1823
1824void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1825 if (!updateToLocation(Loc))
1826 return;
1827 emitFlush(Loc);
1828}
1829
1830void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1831 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1832 // global_tid);
1833 uint32_t SrcLocStrSize;
1834 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1835 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1836 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1837
1838 // Ignore return result until untied tasks are supported.
1839 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1840 Args);
1841}
1842
1843void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1844 if (!updateToLocation(Loc))
1845 return;
1846 emitTaskwaitImpl(Loc);
1847}
1848
1849void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1850 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1851 uint32_t SrcLocStrSize;
1852 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1853 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1854 Constant *I32Null = ConstantInt::getNullValue(Int32);
1855 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1856
1857 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1858 Args);
1859}
1860
1861void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1862 if (!updateToLocation(Loc))
1863 return;
1864 emitTaskyieldImpl(Loc);
1865}
1866
1867 // Processes the dependencies in Dependencies and does the following:
1868 // - Allocates stack space for an array of DependInfo objects.
1869 // - Populates each DependInfo object with the relevant information about
1870 // the corresponding dependence.
1871// - All code is inserted in the entry block of the current function.
1872 static Value *emitTaskDependencies(
1873 OpenMPIRBuilder &OMPBuilder,
1874 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1875 // Early return if we have no dependencies to process
1876 if (Dependencies.empty())
1877 return nullptr;
1878
1879 // Given a vector of DependData objects, in this function we create an
1880 // array on the stack that holds kmp_depend_info objects corresponding
1881 // to each dependency. This is then passed to the OpenMP runtime.
1882 // For example, if there are 'n' dependencies then the following pseudo
1883 // code is generated. Assume the first dependence is on a variable 'a'
1884 //
1885 // \code{c}
1886 // DepArray = alloc(n x sizeof(kmp_depend_info));
1887 // idx = 0;
1888 // DepArray[idx].base_addr = ptrtoint(&a);
1889 // DepArray[idx].len = 8;
1890 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1891 // ++idx;
1892 // DepArray[idx].base_addr = ...;
1893 // \endcode
1894
1895 IRBuilderBase &Builder = OMPBuilder.Builder;
1896 Type *DependInfo = OMPBuilder.DependInfo;
1897 Module &M = OMPBuilder.M;
1898
1899 Value *DepArray = nullptr;
1900 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1901 Builder.SetInsertPoint(
1902 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1903
1904 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1905 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1906
1907 Builder.restoreIP(OldIP);
1908
1909 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1910 Value *Base =
1911 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1912 // Store the pointer to the variable
1913 Value *Addr = Builder.CreateStructGEP(
1914 DependInfo, Base,
1915 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1916 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1917 Builder.CreateStore(DepValPtr, Addr);
1918 // Store the size of the variable
1919 Value *Size = Builder.CreateStructGEP(
1920 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1921 Builder.CreateStore(
1922 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1923 Size);
1924 // Store the dependency kind
1925 Value *Flags = Builder.CreateStructGEP(
1926 DependInfo, Base,
1927 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1928 Builder.CreateStore(
1929 ConstantInt::get(Builder.getInt8Ty(),
1930 static_cast<unsigned int>(Dep.DepKind)),
1931 Flags);
1932 }
1933 return DepArray;
1934}
1935
1936OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1937 const LocationDescription &Loc, InsertPointTy AllocaIP,
1938 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1939 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1940 Value *Priority) {
1941
1942 if (!updateToLocation(Loc))
1943 return InsertPointTy();
1944
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 // The current basic block is split into four basic blocks. After outlining,
1949 // they will be mapped as follows:
1950 // ```
1951 // def current_fn() {
1952 // current_basic_block:
1953 // br label %task.exit
1954 // task.exit:
1955 // ; instructions after task
1956 // }
1957 // def outlined_fn() {
1958 // task.alloca:
1959 // br label %task.body
1960 // task.body:
1961 // ret void
1962 // }
1963 // ```
1964 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1965 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1966 BasicBlock *TaskAllocaBB =
1967 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1968
1969 InsertPointTy TaskAllocaIP =
1970 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1971 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1972 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1973 return Err;
1974
1975 OutlineInfo OI;
1976 OI.EntryBB = TaskAllocaBB;
1977 OI.OuterAllocaBB = AllocaIP.getBlock();
1978 OI.ExitBB = TaskExitBB;
1979
1980 // Add the thread ID argument.
1981 SmallVector<Instruction *, 4> ToBeDeleted;
1982 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1983 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1984
1985 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1986 Mergeable, Priority, EventHandle, TaskAllocaBB,
1987 ToBeDeleted](Function &OutlinedFn) mutable {
1988 // Replace the Stale CI by appropriate RTL function call.
1989 assert(OutlinedFn.hasOneUse() &&
1990 "there must be a single user for the outlined function");
1991 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1992
1993 // HasShareds is true if any variables are captured in the outlined region,
1994 // false otherwise.
1995 bool HasShareds = StaleCI->arg_size() > 1;
1996 Builder.SetInsertPoint(StaleCI);
1997
1998 // Gather the arguments for emitting the runtime call for
1999 // @__kmpc_omp_task_alloc
2000 Function *TaskAllocFn =
2001 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2002
2003 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2004 // call.
2005 Value *ThreadID = getOrCreateThreadID(Ident);
2006
2007 // Argument - `flags`
2008 // Task is tied iff (Flags & 1) == 1.
2009 // Task is untied iff (Flags & 1) == 0.
2010 // Task is final iff (Flags & 2) == 2.
2011 // Task is not final iff (Flags & 2) == 0.
2012 // Task is mergeable iff (Flags & 4) == 4.
2013 // Task is not mergeable iff (Flags & 4) == 0.
2014 // Task is priority iff (Flags & 32) == 32.
2015 // Task is not priority iff (Flags & 32) == 0.
2016 // TODO: Handle the other flags.
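// Worked example (editorial): a tied task that is also mergeable and has a
// priority clause is encoded as Flags == 1 | 4 | 32 == 37.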
2017 Value *Flags = Builder.getInt32(Tied);
2018 if (Final) {
2019 Value *FinalFlag =
2020 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2021 Flags = Builder.CreateOr(FinalFlag, Flags);
2022 }
2023
2024 if (Mergeable)
2025 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2026 if (Priority)
2027 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2028
2029 // Argument - `sizeof_kmp_task_t` (TaskSize)
2030 // Tasksize refers to the size in bytes of kmp_task_t data structure
2031 // including private vars accessed in task.
2032 // TODO: add kmp_task_t_with_privates (privates)
2033 Value *TaskSize = Builder.getInt64(
2034 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2035
2036 // Argument - `sizeof_shareds` (SharedsSize)
2037 // SharedsSize refers to the shareds array size in the kmp_task_t data
2038 // structure.
2039 Value *SharedsSize = Builder.getInt64(0);
2040 if (HasShareds) {
2041 AllocaInst *ArgStructAlloca =
2042 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2043 assert(ArgStructAlloca &&
2044 "Unable to find the alloca instruction corresponding to arguments "
2045 "for extracted function");
2046 StructType *ArgStructType =
2047 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2048 assert(ArgStructType && "Unable to find struct type corresponding to "
2049 "arguments for extracted function");
2050 SharedsSize =
2051 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2052 }
2053 // Emit the @__kmpc_omp_task_alloc runtime call
2054 // The runtime call returns a pointer to an area where the task captured
2055 // variables must be copied before the task is run (TaskData)
2056 CallInst *TaskData = Builder.CreateCall(
2057 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2058 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2059 /*task_func=*/&OutlinedFn});
2060
2061 // Emit detach clause initialization.
2062 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2063 // task_descriptor);
2064 if (EventHandle) {
2065 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2066 OMPRTL___kmpc_task_allow_completion_event);
2067 llvm::Value *EventVal =
2068 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2069 llvm::Value *EventHandleAddr =
2070 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2071 Builder.getPtrTy(0));
2072 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2073 Builder.CreateStore(EventVal, EventHandleAddr);
2074 }
2075 // Copy the arguments for outlined function
2076 if (HasShareds) {
2077 Value *Shareds = StaleCI->getArgOperand(1);
2078 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2079 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2080 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2081 SharedsSize);
2082 }
2083
2084 if (Priority) {
2085 //
2086 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2087 // we populate the priority information into the "kmp_task_t" here
2088 //
2089 // The struct "kmp_task_t" definition is available in kmp.h
2090 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2091 // data2 is used for priority
2092 //
2093 Type *Int32Ty = Builder.getInt32Ty();
2094 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2095 // kmp_task_t* => { ptr }
2096 Type *TaskPtr = StructType::get(VoidPtr);
2097 Value *TaskGEP =
2098 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2099 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2100 Type *TaskStructType = StructType::get(
2101 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2102 Value *PriorityData = Builder.CreateInBoundsGEP(
2103 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2104 // kmp_cmplrdata_t => { ptr, ptr }
2105 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2106 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2107 PriorityData, {Zero, Zero});
2108 Builder.CreateStore(Priority, CmplrData);
2109 }
2110
2111 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2112
2113 // In the presence of the `if` clause, the following IR is generated:
2114 // ...
2115 // %data = call @__kmpc_omp_task_alloc(...)
2116 // br i1 %if_condition, label %then, label %else
2117 // then:
2118 // call @__kmpc_omp_task(...)
2119 // br label %exit
2120 // else:
2121 // ;; Wait for resolution of dependencies, if any, before
2122 // ;; beginning the task
2123 // call @__kmpc_omp_wait_deps(...)
2124 // call @__kmpc_omp_task_begin_if0(...)
2125 // call @outlined_fn(...)
2126 // call @__kmpc_omp_task_complete_if0(...)
2127 // br label %exit
2128 // exit:
2129 // ...
2130 if (IfCondition) {
2131 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2132 // terminator.
2133 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2134 Instruction *IfTerminator =
2135 Builder.GetInsertPoint()->getParent()->getTerminator();
2136 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2137 Builder.SetInsertPoint(IfTerminator);
2138 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2139 &ElseTI);
2140 Builder.SetInsertPoint(ElseTI);
2141
2142 if (Dependencies.size()) {
2143 Function *TaskWaitFn =
2144 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2145 Builder.CreateCall(
2146 TaskWaitFn,
2147 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2148 ConstantInt::get(Builder.getInt32Ty(), 0),
2149 ConstantPointerNull::get(Builder.getPtrTy())});
2150 }
2151 Function *TaskBeginFn =
2152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2153 Function *TaskCompleteFn =
2154 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2155 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2156 CallInst *CI = nullptr;
2157 if (HasShareds)
2158 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2159 else
2160 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2161 CI->setDebugLoc(StaleCI->getDebugLoc());
2162 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2163 Builder.SetInsertPoint(ThenTI);
2164 }
2165
2166 if (Dependencies.size()) {
2167 Function *TaskFn =
2168 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2169 Builder.CreateCall(
2170 TaskFn,
2171 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2172 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2173 ConstantPointerNull::get(Builder.getPtrTy())});
2174
2175 } else {
2176 // Emit the @__kmpc_omp_task runtime call to spawn the task
2177 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2178 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2179 }
2180
2181 StaleCI->eraseFromParent();
2182
2183 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2184 if (HasShareds) {
2185 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2186 OutlinedFn.getArg(1)->replaceUsesWithIf(
2187 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2188 }
2189
2190 for (Instruction *I : llvm::reverse(ToBeDeleted))
2191 I->eraseFromParent();
2192 };
2193
2194 addOutlineInfo(std::move(OI));
2195 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2196
2197 return Builder.saveIP();
2198}
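// Minimal invocation sketch (hypothetical names, not from the original
// source):
//   InsertPointOrErrorTy AfterIP = OMPBuilder.createTask(
//       Loc, AllocaIP, BodyGenCB, /*Tied=*/true, /*Final=*/nullptr,
//       /*IfCondition=*/nullptr, /*Dependencies=*/{}, /*Mergeable=*/false,
//       /*EventHandle=*/nullptr, /*Priority=*/nullptr);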
2199
2200OpenMPIRBuilder::InsertPointOrErrorTy
2201OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2202 InsertPointTy AllocaIP,
2203 BodyGenCallbackTy BodyGenCB) {
2204 if (!updateToLocation(Loc))
2205 return InsertPointTy();
2206
2207 uint32_t SrcLocStrSize;
2208 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2209 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2210 Value *ThreadID = getOrCreateThreadID(Ident);
2211
2212 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2213 Function *TaskgroupFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2215 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2216
2217 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2218 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2219 return Err;
2220
2221 Builder.SetInsertPoint(TaskgroupExitBB);
2222 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2223 Function *EndTaskgroupFn =
2224 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2225 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2226
2227 return Builder.saveIP();
2228}
2229
2230OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2231 const LocationDescription &Loc, InsertPointTy AllocaIP,
2232 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2233 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2234 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2235
2236 if (!updateToLocation(Loc))
2237 return Loc.IP;
2238
2239 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2240 // this block may not have been created yet when this callback runs.
2241 SmallVector<BranchInst *> CancellationBranches;
2242 auto FiniCBWrapper = [&](InsertPointTy IP) {
2243 if (IP.getBlock()->end() != IP.getPoint())
2244 return FiniCB(IP);
2245 // This must be done; otherwise, any nested constructs using FinalizeOMPRegion
2246 // will fail because that function requires the finalization basic block to
2247 // have a terminator, which is already removed by EmitOMPRegionBody.
2248 // IP is currently at the cancellation block.
2249 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2250 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2251 CancellationBranches.push_back(DummyBranch);
2252 return FiniCB(IP);
2253 };
2254
2255 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2256
2257 // Each section is emitted as a switch case
2258 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2259 // -> OMP.createSection() which generates the IR for each section
2260 // Iterate through all sections and emit a switch construct:
2261 // switch (IV) {
2262 // case 0:
2263 // <SectionStmt[0]>;
2264 // break;
2265 // ...
2266 // case <NumSection> - 1:
2267 // <SectionStmt[<NumSection> - 1]>;
2268 // break;
2269 // }
2270 // ...
2271 // section_loop.after:
2272 // <FiniCB>;
2273 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2274 Builder.restoreIP(CodeGenIP);
2275 BasicBlock *Continue =
2276 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2277 Function *CurFn = Continue->getParent();
2278 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2279
2280 unsigned CaseNumber = 0;
2281 for (auto SectionCB : SectionCBs) {
2282 BasicBlock *CaseBB = BasicBlock::Create(
2283 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2284 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2285 Builder.SetInsertPoint(CaseBB);
2286 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2287 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2288 CaseEndBr->getIterator()}))
2289 return Err;
2290 CaseNumber++;
2291 }
2292 // Remove the existing terminator from the body BB since there can be no
2293 // terminators after a switch/case.
2294 return Error::success();
2295 };
2296 // Loop body ends here
2297 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2298 Type *I32Ty = Type::getInt32Ty(M.getContext());
2299 Value *LB = ConstantInt::get(I32Ty, 0);
2300 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2301 Value *ST = ConstantInt::get(I32Ty, 1);
2302 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2303 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2304 if (!LoopInfo)
2305 return LoopInfo.takeError();
2306
2307 InsertPointOrErrorTy WsloopIP =
2308 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2309 WorksharingLoopType::ForStaticLoop, !IsNowait);
2310 if (!WsloopIP)
2311 return WsloopIP.takeError();
2312 InsertPointTy AfterIP = *WsloopIP;
2313
2314 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2315 assert(LoopFini && "Bad structure of static workshare loop finalization");
2316
2317 // Apply the finalization callback in LoopAfterBB
2318 auto FiniInfo = FinalizationStack.pop_back_val();
2319 assert(FiniInfo.DK == OMPD_sections &&
2320 "Unexpected finalization stack state!");
2321 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2322 Builder.restoreIP(AfterIP);
2323 BasicBlock *FiniBB =
2324 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2325 if (Error Err = CB(Builder.saveIP()))
2326 return Err;
2327 AfterIP = {FiniBB, FiniBB->begin()};
2328 }
2329
2330 // Now we can fix the dummy branch to point to the right place
2331 for (BranchInst *DummyBranch : CancellationBranches) {
2332 assert(DummyBranch->getNumSuccessors() == 1);
2333 DummyBranch->setSuccessor(0, LoopFini);
2334 }
2335
2336 return AfterIP;
2337}
2338
2339OpenMPIRBuilder::InsertPointOrErrorTy
2340OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2341 BodyGenCallbackTy BodyGenCB,
2342 FinalizeCallbackTy FiniCB) {
2343 if (!updateToLocation(Loc))
2344 return Loc.IP;
2345
2346 auto FiniCBWrapper = [&](InsertPointTy IP) {
2347 if (IP.getBlock()->end() != IP.getPoint())
2348 return FiniCB(IP);
2349 // This must be done; otherwise, any nested constructs using FinalizeOMPRegion
2350 // will fail because that function requires the finalization basic block to
2351 // have a terminator, which is already removed by EmitOMPRegionBody.
2352 // IP is currently at the cancellation block.
2353 // We need to backtrack to the condition block to fetch
2354 // the exit block and create a branch from the cancellation
2355 // block to the exit block.
2356 IRBuilder<>::InsertPointGuard IPG(Builder);
2357 Builder.restoreIP(IP);
2358 auto *CaseBB = Loc.IP.getBlock();
2359 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2360 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2361 Instruction *I = Builder.CreateBr(ExitBB);
2362 IP = InsertPointTy(I->getParent(), I->getIterator());
2363 return FiniCB(IP);
2364 };
2365
2366 Directive OMPD = Directive::OMPD_sections;
2367 // Since we are using Finalization Callback here, HasFinalize
2368 // and IsCancellable have to be true
2369 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2370 /*Conditional*/ false, /*hasFinalize*/ true,
2371 /*IsCancellable*/ true);
2372}
2373
2374static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2375 BasicBlock::iterator IT(I);
2376 IT++;
2377 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2378}
2379
2380Value *OpenMPIRBuilder::getGPUThreadID() {
2381 return Builder.CreateCall(
2382 getOrCreateRuntimeFunction(M,
2383 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2384 {});
2385}
2386
2387Value *OpenMPIRBuilder::getGPUWarpSize() {
2388 return Builder.CreateCall(
2389 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2390}
2391
2392Value *OpenMPIRBuilder::getNVPTXWarpID() {
2393 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2394 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2395}
2396
2397Value *OpenMPIRBuilder::getNVPTXLaneID() {
2398 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2399 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2400 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2401 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2402 "nvptx_lane_id");
2403}
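// Worked example (editorial): with a warp size of 32, LaneIDBits == 5 and
// LaneIDMask == 0x1f, so getNVPTXLaneID() computes tid & 31 while
// getNVPTXWarpID() computes tid >> 5.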
2404
2405Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2406 Type *ToType) {
2407 Type *FromType = From->getType();
2408 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2409 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2410 assert(FromSize > 0 && "From size must be greater than zero");
2411 assert(ToSize > 0 && "To size must be greater than zero");
2412 if (FromType == ToType)
2413 return From;
2414 if (FromSize == ToSize)
2415 return Builder.CreateBitCast(From, ToType);
2416 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2417 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2418 InsertPointTy SaveIP = Builder.saveIP();
2419 Builder.restoreIP(AllocaIP);
2420 Value *CastItem = Builder.CreateAlloca(ToType);
2421 Builder.restoreIP(SaveIP);
2422
2423 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2424 CastItem, Builder.getPtrTy(0));
2425 Builder.CreateStore(From, ValCastItem);
2426 return Builder.CreateLoad(ToType, CastItem);
2427}
2428
2429Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2430 Value *Element,
2431 Type *ElementType,
2432 Value *Offset) {
2433 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2434 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2435
2436 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2437 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2438 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2439 Value *WarpSize =
2440 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2441 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2442 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2443 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2444 Value *WarpSizeCast =
2445 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2446 Value *ShuffleCall =
2447 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2448 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2449}
2450
2451void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2452 Value *DstAddr, Type *ElemType,
2453 Value *Offset, Type *ReductionArrayTy) {
2454 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2455 // Create the loop over the big sized data.
2456 // ptr = (void*)Elem;
2457 // ptrEnd = (void*) Elem + 1;
2458 // Step = 8;
2459 // while (ptr + Step < ptrEnd)
2460 // shuffle((int64_t)*ptr);
2461 // Step = 4;
2462 // while (ptr + Step < ptrEnd)
2463 // shuffle((int32_t)*ptr);
2464 // ...
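// Worked example (editorial): a 12-byte element is moved as one 8-byte
// shuffle (leaving Size == 12 % 8 == 4) followed by one 4-byte shuffle,
// after which Size == 0 and the remaining iterations are skipped.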
2465 Type *IndexTy = Builder.getIndexTy(
2466 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2467 Value *ElemPtr = DstAddr;
2468 Value *Ptr = SrcAddr;
2469 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2470 if (Size < IntSize)
2471 continue;
2472 Type *IntType = Builder.getIntNTy(IntSize * 8);
2473 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2474 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2475 Value *SrcAddrGEP =
2476 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2477 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2478 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2479
2480 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2481 if ((Size / IntSize) > 1) {
2482 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2483 SrcAddrGEP, Builder.getPtrTy());
2484 BasicBlock *PreCondBB =
2485 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2486 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2487 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2488 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2489 emitBlock(PreCondBB, CurFunc);
2490 PHINode *PhiSrc =
2491 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2492 PhiSrc->addIncoming(Ptr, CurrentBB);
2493 PHINode *PhiDest =
2494 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2495 PhiDest->addIncoming(ElemPtr, CurrentBB);
2496 Ptr = PhiSrc;
2497 ElemPtr = PhiDest;
2498 Value *PtrDiff = Builder.CreatePtrDiff(
2499 Builder.getInt8Ty(), PtrEnd,
2500 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2501 Builder.CreateCondBr(
2502 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2503 ExitBB);
2504 emitBlock(ThenBB, CurFunc);
2505 Value *Res = createRuntimeShuffleFunction(
2506 AllocaIP,
2507 Builder.CreateAlignedLoad(
2508 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2509 IntType, Offset);
2510 Builder.CreateAlignedStore(Res, ElemPtr,
2511 M.getDataLayout().getPrefTypeAlign(ElemType));
2512 Value *LocalPtr =
2513 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2514 Value *LocalElemPtr =
2515 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2516 PhiSrc->addIncoming(LocalPtr, ThenBB);
2517 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2518 emitBranch(PreCondBB);
2519 emitBlock(ExitBB, CurFunc);
2520 } else {
2521 Value *Res = createRuntimeShuffleFunction(
2522 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2523 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2524 Res->getType()->getScalarSizeInBits())
2525 Res = Builder.CreateTrunc(Res, ElemType);
2526 Builder.CreateStore(Res, ElemPtr);
2527 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2528 ElemPtr =
2529 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2530 }
2531 Size = Size % IntSize;
2532 }
2533}
2534
2535void OpenMPIRBuilder::emitReductionListCopy(
2536 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2537 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2538 CopyOptionsTy CopyOptions) {
2539 Type *IndexTy = Builder.getIndexTy(
2540 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2541 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2542
2543 // Iterates, element-by-element, through the source Reduce list and
2544 // makes a copy.
2545 for (auto En : enumerate(ReductionInfos)) {
2546 const ReductionInfo &RI = En.value();
2547 Value *SrcElementAddr = nullptr;
2548 Value *DestElementAddr = nullptr;
2549 Value *DestElementPtrAddr = nullptr;
2550 // Should we shuffle in an element from a remote lane?
2551 bool ShuffleInElement = false;
2552 // Set to true to update the pointer in the dest Reduce list to a
2553 // newly created element.
2554 bool UpdateDestListPtr = false;
2555
2556 // Step 1.1: Get the address for the src element in the Reduce list.
2557 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2558 ReductionArrayTy, SrcBase,
2559 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2560 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2561
2562 // Step 1.2: Create a temporary to store the element in the destination
2563 // Reduce list.
2564 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2565 ReductionArrayTy, DestBase,
2566 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2567 switch (Action) {
2568 case CopyAction::RemoteLaneToThread: {
2569 InsertPointTy CurIP = Builder.saveIP();
2570 Builder.restoreIP(AllocaIP);
2571 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2572 ".omp.reduction.element");
2573 DestAlloca->setAlignment(
2574 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2575 DestElementAddr = DestAlloca;
2576 DestElementAddr =
2577 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2578 DestElementAddr->getName() + ".ascast");
2579 Builder.restoreIP(CurIP);
2580 ShuffleInElement = true;
2581 UpdateDestListPtr = true;
2582 break;
2583 }
2584 case CopyAction::ThreadCopy: {
2585 DestElementAddr =
2586 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2587 break;
2588 }
2589 }
2590
2591 // Now that all active lanes have read the element in the
2592 // Reduce list, shuffle over the value from the remote lane.
2593 if (ShuffleInElement) {
2594 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2595 RemoteLaneOffset, ReductionArrayTy);
2596 } else {
2597 switch (RI.EvaluationKind) {
2598 case EvalKind::Scalar: {
2599 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2600 // Store the source element value to the dest element address.
2601 Builder.CreateStore(Elem, DestElementAddr);
2602 break;
2603 }
2604 case EvalKind::Complex: {
2605 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2606 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2607 Value *SrcReal = Builder.CreateLoad(
2608 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2609 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2610 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2611 Value *SrcImg = Builder.CreateLoad(
2612 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2613
2614 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2615 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2616 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2617 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2618 Builder.CreateStore(SrcReal, DestRealPtr);
2619 Builder.CreateStore(SrcImg, DestImgPtr);
2620 break;
2621 }
2622 case EvalKind::Aggregate: {
2623 Value *SizeVal = Builder.getInt64(
2624 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2625 Builder.CreateMemCpy(
2626 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2627 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2628 SizeVal, false);
2629 break;
2630 }
2631 };
2632 }
2633
2634 // Step 3.1: Modify reference in dest Reduce list as needed.
2635 // Modifying the reference in Reduce list to point to the newly
2636 // created element. The element is live in the current function
2637 // scope and that of functions it invokes (i.e., reduce_function).
2638 // RemoteReduceData[i] = (void*)&RemoteElem
2639 if (UpdateDestListPtr) {
2640 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2641 DestElementAddr, Builder.getPtrTy(),
2642 DestElementAddr->getName() + ".ascast");
2643 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2644 }
2645 }
2646}
2647
2648Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2649 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2650 AttributeList FuncAttrs) {
2651 InsertPointTy SavedIP = Builder.saveIP();
2652 LLVMContext &Ctx = M.getContext();
2653 FunctionType *FuncTy = FunctionType::get(
2654 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2655 /* IsVarArg */ false);
2656 Function *WcFunc =
2658 "_omp_reduction_inter_warp_copy_func", &M);
2659 WcFunc->setAttributes(FuncAttrs);
2660 WcFunc->addParamAttr(0, Attribute::NoUndef);
2661 WcFunc->addParamAttr(1, Attribute::NoUndef);
2662 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2663 Builder.SetInsertPoint(EntryBB);
2664
2665 // ReduceList: thread local Reduce list.
2666 // At the stage of the computation when this function is called, partially
2667 // aggregated values reside in the first lane of every active warp.
2668 Argument *ReduceListArg = WcFunc->getArg(0);
2669 // NumWarps: number of warps active in the parallel region. This could
2670 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2671 Argument *NumWarpsArg = WcFunc->getArg(1);
2672
2673 // This array is used as a medium to transfer, one reduce element at a time,
2674 // the data from the first lane of every warp to lanes in the first warp
2675 // in order to perform the final step of a reduction in a parallel region
2676 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2677 // for reduced latency, as well as to have a distinct copy for concurrently
2678 // executing target regions. The array is declared with common linkage so
2679 // as to be shared across compilation units.
2680 StringRef TransferMediumName =
2681 "__openmp_nvptx_data_transfer_temporary_storage";
2682 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2683 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2684 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2685 if (!TransferMedium) {
2686 TransferMedium = new GlobalVariable(
2687 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2688 UndefValue::get(ArrayTy), TransferMediumName,
2689 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2690 /*AddressSpace=*/3);
2691 }
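// Editorial note: with a warp size of 32 this medium is a [32 x i32] array
// (128 bytes) in addrspace(3), one slot per warp master.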
2692
2693 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694 Value *GPUThreadID = getGPUThreadID();
2695 // nvptx_lane_id = nvptx_id % warpsize
2696 Value *LaneID = getNVPTXLaneID();
2697 // nvptx_warp_id = nvptx_id / warpsize
2698 Value *WarpID = getNVPTXWarpID();
2699
2700 InsertPointTy AllocaIP =
2701 InsertPointTy(Builder.GetInsertBlock(),
2702 Builder.GetInsertBlock()->getFirstInsertionPt());
2703 Type *Arg0Type = ReduceListArg->getType();
2704 Type *Arg1Type = NumWarpsArg->getType();
2705 Builder.restoreIP(AllocaIP);
2706 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2707 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2708 AllocaInst *NumWarpsAlloca =
2709 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2710 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2711 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2712 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2713 NumWarpsAlloca, Builder.getPtrTy(0),
2714 NumWarpsAlloca->getName() + ".ascast");
2715 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2716 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2717 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2718 InsertPointTy CodeGenIP =
2719 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2720 Builder.restoreIP(CodeGenIP);
2721
2722 Value *ReduceList =
2723 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2724
2725 for (auto En : enumerate(ReductionInfos)) {
2726 //
2727 // Warp master copies reduce element to transfer medium in __shared__
2728 // memory.
2729 //
2730 const ReductionInfo &RI = En.value();
2731 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2732 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2733 Type *CType = Builder.getIntNTy(TySize * 8);
2734
2735 unsigned NumIters = RealTySize / TySize;
2736 if (NumIters == 0)
2737 continue;
2738 Value *Cnt = nullptr;
2739 Value *CntAddr = nullptr;
2740 BasicBlock *PrecondBB = nullptr;
2741 BasicBlock *ExitBB = nullptr;
2742 if (NumIters > 1) {
2743 CodeGenIP = Builder.saveIP();
2744 Builder.restoreIP(AllocaIP);
2745 CntAddr =
2746 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2747
2748 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2749 CntAddr->getName() + ".ascast");
2750 Builder.restoreIP(CodeGenIP);
2751 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2752 CntAddr,
2753 /*Volatile=*/false);
2754 PrecondBB = BasicBlock::Create(Ctx, "precond");
2755 ExitBB = BasicBlock::Create(Ctx, "exit");
2756 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2757 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2758 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2759 /*Volatile=*/false);
2760 Value *Cmp = Builder.CreateICmpULT(
2761 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2762 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2763 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2764 }
2765
2766 // kmpc_barrier.
2767 InsertPointOrErrorTy BarrierIP1 =
2768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2769 omp::Directive::OMPD_unknown,
2770 /* ForceSimpleCall */ false,
2771 /* CheckCancelFlag */ true);
2772 if (!BarrierIP1)
2773 return BarrierIP1.takeError();
2774 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2775 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2776 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2777
2778 // if (lane_id == 0)
2779 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2780 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2781 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2782
2783 // Reduce element = LocalReduceList[i]
2784 auto *RedListArrayTy =
2785 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2786 Type *IndexTy = Builder.getIndexTy(
2787 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2788 Value *ElemPtrPtr =
2789 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2790 {ConstantInt::get(IndexTy, 0),
2791 ConstantInt::get(IndexTy, En.index())});
2792 // elemptr = ((CopyType*)(elemptrptr)) + I
2793 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2794 if (NumIters > 1)
2795 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2796
2797 // Get pointer to location in transfer medium.
2798 // MediumPtr = &medium[warp_id]
2799 Value *MediumPtr = Builder.CreateInBoundsGEP(
2800 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2801 // elem = *elemptr
2802 //*MediumPtr = elem
2803 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2804 // Store the source element value to the dest element address.
2805 Builder.CreateStore(Elem, MediumPtr,
2806 /*IsVolatile*/ true);
2807 Builder.CreateBr(MergeBB);
2808
2809 // else
2810 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2811 Builder.CreateBr(MergeBB);
2812
2813 // endif
2814 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2815 InsertPointOrErrorTy BarrierIP2 =
2816 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2817 omp::Directive::OMPD_unknown,
2818 /* ForceSimpleCall */ false,
2819 /* CheckCancelFlag */ true);
2820 if (!BarrierIP2)
2821 return BarrierIP2.takeError();
2822
2823 // Warp 0 copies reduce element from transfer medium
2824 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2825 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2826 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2827
2828 Value *NumWarpsVal =
2829 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2830 // Up to 32 threads in warp 0 are active.
2831 Value *IsActiveThread =
2832 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2833 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2834
2835 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2836
2837 // SrcMediumPtr = &medium[tid]
2838 // SrcMediumVal = *SrcMediumPtr
2839 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2840 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2841 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2842 Value *TargetElemPtrPtr =
2843 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2844 {ConstantInt::get(IndexTy, 0),
2845 ConstantInt::get(IndexTy, En.index())});
2846 Value *TargetElemPtrVal =
2847 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2848 Value *TargetElemPtr = TargetElemPtrVal;
2849 if (NumIters > 1)
2850 TargetElemPtr =
2851 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2852
2853 // *TargetElemPtr = SrcMediumVal;
2854 Value *SrcMediumValue =
2855 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2856 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2857 Builder.CreateBr(W0MergeBB);
2858
2859 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2860 Builder.CreateBr(W0MergeBB);
2861
2862 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2863
2864 if (NumIters > 1) {
2865 Cnt = Builder.CreateNSWAdd(
2866 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2867 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2868
2869 auto *CurFn = Builder.GetInsertBlock()->getParent();
2870 emitBranch(PrecondBB);
2871 emitBlock(ExitBB, CurFn);
2872 }
2873 RealTySize %= TySize;
2874 }
2875 }
2876
2877 Builder.CreateRetVoid();
2878 Builder.restoreIP(SavedIP);
2879
2880 return WcFunc;
2881}
2882
2883Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2885 AttributeList FuncAttrs) {
2886 LLVMContext &Ctx = M.getContext();
2887 FunctionType *FuncTy =
2888 FunctionType::get(Builder.getVoidTy(),
2889 {Builder.getPtrTy(), Builder.getInt16Ty(),
2890 Builder.getInt16Ty(), Builder.getInt16Ty()},
2891 /* IsVarArg */ false);
2892 Function *SarFunc =
2894 "_omp_reduction_shuffle_and_reduce_func", &M);
2895 SarFunc->setAttributes(FuncAttrs);
2896 SarFunc->addParamAttr(0, Attribute::NoUndef);
2897 SarFunc->addParamAttr(1, Attribute::NoUndef);
2898 SarFunc->addParamAttr(2, Attribute::NoUndef);
2899 SarFunc->addParamAttr(3, Attribute::NoUndef);
2900 SarFunc->addParamAttr(1, Attribute::SExt);
2901 SarFunc->addParamAttr(2, Attribute::SExt);
2902 SarFunc->addParamAttr(3, Attribute::SExt);
2903 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2904 Builder.SetInsertPoint(EntryBB);
2905
2906 // Thread local Reduce list used to host the values of data to be reduced.
2907 Argument *ReduceListArg = SarFunc->getArg(0);
2908 // Current lane id; could be logical.
2909 Argument *LaneIDArg = SarFunc->getArg(1);
2910 // Offset of the remote source lane relative to the current lane.
2911 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2912 // Algorithm version. This is expected to be known at compile time.
2913 Argument *AlgoVerArg = SarFunc->getArg(3);
2914
2915 Type *ReduceListArgType = ReduceListArg->getType();
2916 Type *LaneIDArgType = LaneIDArg->getType();
2917 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2918 Value *ReduceListAlloca = Builder.CreateAlloca(
2919 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2920 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2921 LaneIDArg->getName() + ".addr");
2922 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2923 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2924 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2925 AlgoVerArg->getName() + ".addr");
2926 ArrayType *RedListArrayTy =
2927 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2928
2929 // Create a local thread-private variable to host the Reduce list
2930 // from a remote lane.
2931 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2932 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2933
2934 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2935 ReduceListAlloca, ReduceListArgType,
2936 ReduceListAlloca->getName() + ".ascast");
2937 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2938 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2939 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2940 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2941 RemoteLaneOffsetAlloca->getName() + ".ascast");
2942 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2943 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2944 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2945 RemoteReductionListAlloca, Builder.getPtrTy(),
2946 RemoteReductionListAlloca->getName() + ".ascast");
2947
2948 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2949 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2950 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2951 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2952
2953 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2954 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2955 Value *RemoteLaneOffset =
2956 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2957 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2958
2959 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2960
2961 // This loop iterates through the list of reduce elements and copies,
2962 // element by element, from a remote lane in the warp to RemoteReduceList,
2963 // hosted on the thread's stack.
2964 emitReductionListCopy(
2965 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2966 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2967
2968 // The actions to be performed on the Remote Reduce list are dependent
2969 // on the algorithm version.
2970 //
2971 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2972 // LaneId % 2 == 0 && Offset > 0):
2973 // do the reduction value aggregation
2974 //
2975 // The thread local variable Reduce list is mutated in place to host the
2976 // reduced data, which is the aggregated value produced from local and
2977 // remote lanes.
2978 //
2979 // Note that AlgoVer is expected to be a constant integer known at compile
2980 // time.
2981 // When AlgoVer==0, the first conjunction evaluates to true, making
2982 // the entire predicate true at compile time.
2983 // When AlgoVer==1, only the second part of the second conjunction needs
2984 // to be evaluated at runtime; the other conjunctions fold to false
2985 // at compile time.
2986 // When AlgoVer==2, only the second part of the third conjunction needs
2987 // to be evaluated at runtime; the other conjunctions fold to false
2988 // at compile time.
2989 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2990 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2991 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2992 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2993 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2994 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2995 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2996 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2997 Value *RemoteOffsetComp =
2998 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2999 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3000 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3001 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3002
3003 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3004 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3005 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3006
3007 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3008 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3009 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3010 ReduceList, Builder.getPtrTy());
3011 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3012 RemoteListAddrCast, Builder.getPtrTy());
3013 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3014 ->addFnAttr(Attribute::NoUnwind);
3015 Builder.CreateBr(MergeBB);
3016
3017 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3018 Builder.CreateBr(MergeBB);
3019
3020 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3021
3022 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3023 // Reduce list.
3024 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3025 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3026 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3027
3028 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3029 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3030 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3031 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3032
3033 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3034 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3035 ReductionInfos, RemoteListAddrCast, ReduceList);
3036 Builder.CreateBr(CpyMergeBB);
3037
3038 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3039 Builder.CreateBr(CpyMergeBB);
3040
3041 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3042
3043 Builder.CreateRetVoid();
3044
3045 return SarFunc;
3046}
3047
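// The next four helpers marshal data between the thread-local reduce list
// and the globalized team-reduction buffer. As a sketch in pseudo-C,
// assuming a single scalar reduction variable of type T (names
// illustrative):
//
//   void list_to_global_copy(void *buffer, int idx, void *reduce_list) {
//     struct _globalized_locals_ty *b = buffer;
//     T *elem = ((void **)reduce_list)[0];
//     b[idx].reduction0 = *elem;
//   }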
3048Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3049 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3050 AttributeList FuncAttrs) {
3051 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3052 LLVMContext &Ctx = M.getContext();
3053 auto *FuncTy = FunctionType::get(
3054 Builder.getVoidTy(),
3055 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3056 /* IsVarArg */ false);
3057 Function *LtGCFunc =
3058 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3059 "_omp_reduction_list_to_global_copy_func", &M);
3060 LtGCFunc->setAttributes(FuncAttrs);
3061 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3062 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3063 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3064
3065 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3066 Builder.SetInsertPoint(EntryBlock);
3067
3068 // Buffer: global reduction buffer.
3069 Argument *BufferArg = LtGCFunc->getArg(0);
3070 // Idx: index of the buffer.
3071 Argument *IdxArg = LtGCFunc->getArg(1);
3072 // ReduceList: thread local Reduce list.
3073 Argument *ReduceListArg = LtGCFunc->getArg(2);
3074
3075 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3076 BufferArg->getName() + ".addr");
3077 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3078 IdxArg->getName() + ".addr");
3079 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3080 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3081 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3082 BufferArgAlloca, Builder.getPtrTy(),
3083 BufferArgAlloca->getName() + ".ascast");
3084 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3085 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3086 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3087 ReduceListArgAlloca, Builder.getPtrTy(),
3088 ReduceListArgAlloca->getName() + ".ascast");
3089
3090 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3091 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3092 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3093
3094 Value *LocalReduceList =
3095 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3096 Value *BufferArgVal =
3097 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3098 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3099 Type *IndexTy = Builder.getIndexTy(
3100 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3101 for (auto En : enumerate(ReductionInfos)) {
3102 const ReductionInfo &RI = En.value();
3103 auto *RedListArrayTy =
3104 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3105 // Reduce element = LocalReduceList[i]
3106 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3107 RedListArrayTy, LocalReduceList,
3108 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3109 // elemptr = ((CopyType*)(elemptrptr)) + I
3110 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3111
3112 // Global = Buffer.VD[Idx];
3113 Value *BufferVD =
3114 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3115 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3116 ReductionsBufferTy, BufferVD, 0, En.index());
3117
3118 switch (RI.EvaluationKind) {
3119 case EvalKind::Scalar: {
3120 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3121 Builder.CreateStore(TargetElement, GlobVal);
3122 break;
3123 }
3124 case EvalKind::Complex: {
3125 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3126 RI.ElementType, ElemPtr, 0, 0, ".realp");
3127 Value *SrcReal = Builder.CreateLoad(
3128 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3129 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3130 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3131 Value *SrcImg = Builder.CreateLoad(
3132 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3133
3134 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3135 RI.ElementType, GlobVal, 0, 0, ".realp");
3136 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3137 RI.ElementType, GlobVal, 0, 1, ".imagp");
3138 Builder.CreateStore(SrcReal, DestRealPtr);
3139 Builder.CreateStore(SrcImg, DestImgPtr);
3140 break;
3141 }
3142 case EvalKind::Aggregate: {
3143 Value *SizeVal =
3144 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3145 Builder.CreateMemCpy(
3146 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3147 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3148 break;
3149 }
3150 }
3151 }
3152
3153 Builder.CreateRetVoid();
3154 Builder.restoreIP(OldIP);
3155 return LtGCFunc;
3156}
3157
3158Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3159 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3160 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3161 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3162 LLVMContext &Ctx = M.getContext();
3163 auto *FuncTy = FunctionType::get(
3164 Builder.getVoidTy(),
3165 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3166 /* IsVarArg */ false);
3167 Function *LtGRFunc =
3168 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3169 "_omp_reduction_list_to_global_reduce_func", &M);
3170 LtGRFunc->setAttributes(FuncAttrs);
3171 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3172 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3173 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3174
3175 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3176 Builder.SetInsertPoint(EntryBlock);
3177
3178 // Buffer: global reduction buffer.
3179 Argument *BufferArg = LtGRFunc->getArg(0);
3180 // Idx: index of the buffer.
3181 Argument *IdxArg = LtGRFunc->getArg(1);
3182 // ReduceList: thread local Reduce list.
3183 Argument *ReduceListArg = LtGRFunc->getArg(2);
3184
3185 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3186 BufferArg->getName() + ".addr");
3187 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3188 IdxArg->getName() + ".addr");
3189 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3190 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3191 auto *RedListArrayTy =
3192 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3193
3194 // 1. Build a list of reduction variables.
3195 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3196 Value *LocalReduceList =
3197 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3198
3199 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3200 BufferArgAlloca, Builder.getPtrTy(),
3201 BufferArgAlloca->getName() + ".ascast");
3202 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3203 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3204 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 ReduceListArgAlloca, Builder.getPtrTy(),
3206 ReduceListArgAlloca->getName() + ".ascast");
3207 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3208 LocalReduceList, Builder.getPtrTy(),
3209 LocalReduceList->getName() + ".ascast");
3210
3211 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3212 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3213 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3214
3215 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3216 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3217 Type *IndexTy = Builder.getIndexTy(
3218 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3219 for (auto En : enumerate(ReductionInfos)) {
3220 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3221 RedListArrayTy, LocalReduceListAddrCast,
3222 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3223 Value *BufferVD =
3224 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3225 // Global = Buffer.VD[Idx];
3226 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3227 ReductionsBufferTy, BufferVD, 0, En.index());
3228 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3229 }
3230
3231 // Call reduce_function(GlobalReduceList, ReduceList)
3232 Value *ReduceList =
3233 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3234 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3235 ->addFnAttr(Attribute::NoUnwind);
3236 Builder.CreateRetVoid();
3237 Builder.restoreIP(OldIP);
3238 return LtGRFunc;
3239}
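//
// Roughly, the helper above builds a reduce list of pointers into the
// buffer slot and hands it to the reduction function (names illustrative):
//
//   void list_to_global_reduce(void *buffer, int idx, void *reduce_list) {
//     void *glob_list[<n>];  // glob_list[i] = &buffer[idx].reduction_i
//     reduce_fn(glob_list, reduce_list);
//   }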
3240
3241Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3242 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3243 AttributeList FuncAttrs) {
3244 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3245 LLVMContext &Ctx = M.getContext();
3246 auto *FuncTy = FunctionType::get(
3247 Builder.getVoidTy(),
3248 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3249 /* IsVarArg */ false);
3250 Function *LtGCFunc =
3251 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3252 "_omp_reduction_global_to_list_copy_func", &M);
3253 LtGCFunc->setAttributes(FuncAttrs);
3254 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3255 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3256 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3257
3258 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3259 Builder.SetInsertPoint(EntryBlock);
3260
3261 // Buffer: global reduction buffer.
3262 Argument *BufferArg = LtGCFunc->getArg(0);
3263 // Idx: index of the buffer.
3264 Argument *IdxArg = LtGCFunc->getArg(1);
3265 // ReduceList: thread local Reduce list.
3266 Argument *ReduceListArg = LtGCFunc->getArg(2);
3267
3268 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3269 BufferArg->getName() + ".addr");
3270 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3271 IdxArg->getName() + ".addr");
3272 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3273 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3274 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3275 BufferArgAlloca, Builder.getPtrTy(),
3276 BufferArgAlloca->getName() + ".ascast");
3277 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3278 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3279 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3280 ReduceListArgAlloca, Builder.getPtrTy(),
3281 ReduceListArgAlloca->getName() + ".ascast");
3282 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3283 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3284 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3285
3286 Value *LocalReduceList =
3287 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3288 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3289 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3290 Type *IndexTy = Builder.getIndexTy(
3291 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3292 for (auto En : enumerate(ReductionInfos)) {
3293 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3294 auto *RedListArrayTy =
3295 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3296 // Reduce element = LocalReduceList[i]
3297 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3298 RedListArrayTy, LocalReduceList,
3299 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3300 // elemptr = ((CopyType*)(elemptrptr)) + I
3301 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3302 // Global = Buffer.VD[Idx];
3303 Value *BufferVD =
3304 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3305 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3306 ReductionsBufferTy, BufferVD, 0, En.index());
3307
3308 switch (RI.EvaluationKind) {
3309 case EvalKind::Scalar: {
3310 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3311 Builder.CreateStore(TargetElement, ElemPtr);
3312 break;
3313 }
3314 case EvalKind::Complex: {
3315 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3316 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3317 Value *SrcReal = Builder.CreateLoad(
3318 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3319 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3320 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3321 Value *SrcImg = Builder.CreateLoad(
3322 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3323
3324 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3325 RI.ElementType, ElemPtr, 0, 0, ".realp");
3326 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3327 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3328 Builder.CreateStore(SrcReal, DestRealPtr);
3329 Builder.CreateStore(SrcImg, DestImgPtr);
3330 break;
3331 }
3332 case EvalKind::Aggregate: {
3333 Value *SizeVal =
3334 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3335 Builder.CreateMemCpy(
3336 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3337 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3338 SizeVal, false);
3339 break;
3340 }
3341 }
3342 }
3343
3344 Builder.CreateRetVoid();
3345 Builder.restoreIP(OldIP);
3346 return LtGCFunc;
3347}
3348
3349Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3350 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3351 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3352 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3353 LLVMContext &Ctx = M.getContext();
3354 auto *FuncTy = FunctionType::get(
3355 Builder.getVoidTy(),
3356 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3357 /* IsVarArg */ false);
3358 Function *LtGRFunc =
3359 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3360 "_omp_reduction_global_to_list_reduce_func", &M);
3361 LtGRFunc->setAttributes(FuncAttrs);
3362 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3363 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3364 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3365
3366 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3367 Builder.SetInsertPoint(EntryBlock);
3368
3369 // Buffer: global reduction buffer.
3370 Argument *BufferArg = LtGRFunc->getArg(0);
3371 // Idx: index of the buffer.
3372 Argument *IdxArg = LtGRFunc->getArg(1);
3373 // ReduceList: thread local Reduce list.
3374 Argument *ReduceListArg = LtGRFunc->getArg(2);
3375
3376 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3377 BufferArg->getName() + ".addr");
3378 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3379 IdxArg->getName() + ".addr");
3380 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3381 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3382 ArrayType *RedListArrayTy =
3383 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3384
3385 // 1. Build a list of reduction variables.
3386 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3387 Value *LocalReduceList =
3388 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3389
3390 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3391 BufferArgAlloca, Builder.getPtrTy(),
3392 BufferArgAlloca->getName() + ".ascast");
3393 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3394 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3395 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3396 ReduceListArgAlloca, Builder.getPtrTy(),
3397 ReduceListArgAlloca->getName() + ".ascast");
3398 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3399 LocalReduceList, Builder.getPtrTy(),
3400 LocalReduceList->getName() + ".ascast");
3401
3402 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3403 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3404 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3405
3406 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3407 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3408 Type *IndexTy = Builder.getIndexTy(
3409 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3410 for (auto En : enumerate(ReductionInfos)) {
3411 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3412 RedListArrayTy, ReductionList,
3413 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3414 // Global = Buffer.VD[Idx];
3415 Value *BufferVD =
3416 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3417 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3418 ReductionsBufferTy, BufferVD, 0, En.index());
3419 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3420 }
3421
3422 // Call reduce_function(ReduceList, GlobalReduceList)
3423 Value *ReduceList =
3424 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3425 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3426 ->addFnAttr(Attribute::NoUnwind);
3427 Builder.CreateRetVoid();
3428 Builder.restoreIP(OldIP);
3429 return LtGRFunc;
3430}
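//
// This mirrors the list-to-global variant with the operand order swapped,
// so the thread-local list is the destination (names illustrative):
//
//   void global_to_list_reduce(void *buffer, int idx, void *reduce_list) {
//     void *glob_list[<n>];  // glob_list[i] = &buffer[idx].reduction_i
//     reduce_fn(reduce_list, glob_list);
//   }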
3431
3432std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3433 std::string Suffix =
3434 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3435 return (Name + Suffix).str();
3436}
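//
// For a reducer named "foo" this yields a platform-uniqued name; on the
// host, where the separator is '.', that is typically something like
// "foo.omp.reduction.reduction_func", while device targets use their own
// separators (the exact spelling comes from createPlatformSpecificName).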
3437
3438Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3439 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3440 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3441 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3442 {Builder.getPtrTy(), Builder.getPtrTy()},
3443 /* IsVarArg */ false);
3444 std::string Name = getReductionFuncName(ReducerName);
3445 Function *ReductionFunc =
3446 Function::Create(FuncTy, GlobalValue::InternalLinkage, Name, &M);
3447 ReductionFunc->setAttributes(FuncAttrs);
3448 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3449 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3450 BasicBlock *EntryBB =
3451 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3452 Builder.SetInsertPoint(EntryBB);
3453
3454 // We need to allocate memory here and set up the pointers before
3455 // extracting the LHS/RHS pointers.
3456 Value *LHSArrayPtr = nullptr;
3457 Value *RHSArrayPtr = nullptr;
3458 Argument *Arg0 = ReductionFunc->getArg(0);
3459 Argument *Arg1 = ReductionFunc->getArg(1);
3460 Type *Arg0Type = Arg0->getType();
3461 Type *Arg1Type = Arg1->getType();
3462
3463 Value *LHSAlloca =
3464 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3465 Value *RHSAlloca =
3466 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3467 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3468 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3469 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3470 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3471 Builder.CreateStore(Arg0, LHSAddrCast);
3472 Builder.CreateStore(Arg1, RHSAddrCast);
3473 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3474 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3475
3476 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3477 Type *IndexTy = Builder.getIndexTy(
3478 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3479 SmallVector<Value *> LHSPtrs, RHSPtrs;
3480 for (auto En : enumerate(ReductionInfos)) {
3481 const ReductionInfo &RI = En.value();
3482 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3483 RedArrayTy, RHSArrayPtr,
3484 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3485 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3486 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RHSI8Ptr, RI.PrivateVariable->getType(),
3488 RHSI8Ptr->getName() + ".ascast");
3489
3490 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3491 RedArrayTy, LHSArrayPtr,
3492 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3493 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3494 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3495 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3496
3497 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3498 LHSPtrs.emplace_back(LHSPtr);
3499 RHSPtrs.emplace_back(RHSPtr);
3500 } else {
3501 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3502 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3503 Value *Reduced;
3504 InsertPointOrErrorTy AfterIP =
3505 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3506 if (!AfterIP)
3507 return AfterIP.takeError();
3508 if (!Builder.GetInsertBlock())
3509 return ReductionFunc;
3510 Builder.CreateStore(Reduced, LHSPtr);
3511 }
3512 }
3513
3514 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3515 for (auto En : enumerate(ReductionInfos)) {
3516 unsigned Index = En.index();
3517 const ReductionInfo &RI = En.value();
3518 Value *LHSFixupPtr, *RHSFixupPtr;
3519 Builder.restoreIP(RI.ReductionGenClang(
3520 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3521
3522 // Fix the callback code generated to use the correct Values for the LHS
3523 // and RHS.
3524 LHSFixupPtr->replaceUsesWithIf(
3525 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3526 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3527 ReductionFunc;
3528 });
3529 RHSFixupPtr->replaceUsesWithIf(
3530 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3531 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3532 ReductionFunc;
3533 });
3534 }
3535
3536 Builder.CreateRetVoid();
3537 return ReductionFunc;
3538}
3539
3540static void
3541checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3542 bool IsGPU) {
3543 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3544 (void)RI;
3545 assert(RI.Variable && "expected non-null variable");
3546 assert(RI.PrivateVariable && "expected non-null private variable");
3547 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3548 "expected non-null reduction generator callback");
3549 if (!IsGPU) {
3550 assert(
3551 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3552 "expected variables and their private equivalents to have the same "
3553 "type");
3554 }
3555 assert(RI.Variable->getType()->isPointerTy() &&
3556 "expected variables to be pointers");
3557 }
3558}
3559
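// In rough pseudo-C, the non-teams path emitted below is (names
// illustrative):
//
//   void *red_list[<n>] = {priv0, ..., priv<n-1>};
//   int res = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       loc, reduction_data_size, red_list, shuffle_and_reduce_fn,
//       inter_warp_copy_fn);
//   if (res == 1) {
//     // Fold the reduced private copies back into the original variables.
//   }
//
// The teams path instead calls __kmpc_nvptx_teams_reduce_nowait_v2 with the
// four list<->global helpers and a runtime-provided fixed buffer.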
3560OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3561 const LocationDescription &Loc, InsertPointTy AllocaIP,
3562 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3563 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3564 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3565 Value *SrcLocInfo) {
3566 if (!updateToLocation(Loc))
3567 return InsertPointTy();
3568 Builder.restoreIP(CodeGenIP);
3569 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3570 LLVMContext &Ctx = M.getContext();
3571
3572 // Source location for the ident struct
3573 if (!SrcLocInfo) {
3574 uint32_t SrcLocStrSize;
3575 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3576 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3577 }
3578
3579 if (ReductionInfos.size() == 0)
3580 return Builder.saveIP();
3581
3582 BasicBlock *ContinuationBlock = nullptr;
3583 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3584 // Copied code from createReductions
3585 BasicBlock *InsertBlock = Loc.IP.getBlock();
3586 ContinuationBlock =
3587 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3588 InsertBlock->getTerminator()->eraseFromParent();
3589 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3590 }
3591
3592 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3593 AttributeList FuncAttrs;
3594 AttrBuilder AttrBldr(Ctx);
3595 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3596 AttrBldr.addAttribute(Attr);
3597 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3598 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3599
3600 CodeGenIP = Builder.saveIP();
3601 Expected<Function *> ReductionResult =
3602 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3603 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3604 if (!ReductionResult)
3605 return ReductionResult.takeError();
3606 Function *ReductionFunc = *ReductionResult;
3607 Builder.restoreIP(CodeGenIP);
3608
3609 // Set the grid value in the config needed for lowering later on
3610 if (GridValue.has_value())
3611 Config.setGridValue(GridValue.value());
3612 else
3613 Config.setGridValue(getGridValue(T, ReductionFunc));
3614
3615 // Build res = __kmpc_nvptx_parallel_reduce_nowait_v2(<loc>, <data_size>,
3616 // RedList, shuffle_reduce_func, interwarp_copy_func);
3617 // or, for teams reductions,
3618 // res = __kmpc_nvptx_teams_reduce_nowait_v2(<loc>, <buffer>, ...);
3619 Value *Res;
3620
3621 // 1. Build a list of reduction variables.
3622 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3623 auto Size = ReductionInfos.size();
3624 Type *PtrTy = PointerType::getUnqual(Ctx);
3625 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3626 CodeGenIP = Builder.saveIP();
3627 Builder.restoreIP(AllocaIP);
3628 Value *ReductionListAlloca =
3629 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3630 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3631 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3632 Builder.restoreIP(CodeGenIP);
3633 Type *IndexTy = Builder.getIndexTy(
3634 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3635 for (auto En : enumerate(ReductionInfos)) {
3636 const ReductionInfo &RI = En.value();
3637 Value *ElemPtr = Builder.CreateInBoundsGEP(
3638 RedArrayTy, ReductionList,
3639 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3640 Value *CastElem =
3641 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3642 Builder.CreateStore(CastElem, ElemPtr);
3643 }
3644 CodeGenIP = Builder.saveIP();
3645 Function *SarFunc =
3646 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3647 Expected<Function *> CopyResult =
3648 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3649 if (!CopyResult)
3650 return CopyResult.takeError();
3651 Function *WcFunc = *CopyResult;
3652 Builder.restoreIP(CodeGenIP);
3653
3654 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3655
3656 unsigned MaxDataSize = 0;
3657 SmallVector<Type *> ReductionTypeArgs;
3658 for (auto En : enumerate(ReductionInfos)) {
3659 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3660 if (Size > MaxDataSize)
3661 MaxDataSize = Size;
3662 ReductionTypeArgs.emplace_back(En.value().ElementType);
3663 }
3664 Value *ReductionDataSize =
3665 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3666 if (!IsTeamsReduction) {
3667 Value *SarFuncCast =
3668 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3669 Value *WcFuncCast =
3670 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3671 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3672 WcFuncCast};
3673 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3674 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3675 Res = Builder.CreateCall(Pv2Ptr, Args);
3676 } else {
3677 CodeGenIP = Builder.saveIP();
3678 StructType *ReductionsBufferTy = StructType::create(
3679 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3680 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3681 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3682 Function *LtGCFunc = emitListToGlobalCopyFunction(
3683 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3684 Function *LtGRFunc = emitListToGlobalReduceFunction(
3685 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3686 Function *GtLCFunc = emitGlobalToListCopyFunction(
3687 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3688 Function *GtLRFunc = emitGlobalToListReduceFunction(
3689 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3690 Builder.restoreIP(CodeGenIP);
3691
3692 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3693 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3694
3695 Value *Args3[] = {SrcLocInfo,
3696 KernelTeamsReductionPtr,
3697 Builder.getInt32(ReductionBufNum),
3698 ReductionDataSize,
3699 RL,
3700 SarFunc,
3701 WcFunc,
3702 LtGCFunc,
3703 LtGRFunc,
3704 GtLCFunc,
3705 GtLRFunc};
3706
3707 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3708 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3709 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3710 }
3711
3712 // 5. Build if (res == 1)
3713 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3714 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3715 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3716 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3717
3718 // 6. Build then branch: where we have reduced values in the master
3719 // thread in each team.
3720 // __kmpc_end_reduce{_nowait}(<gtid>);
3721 // break;
3722 emitBlock(ThenBB, CurFunc);
3723
3724 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3725 for (auto En : enumerate(ReductionInfos)) {
3726 const ReductionInfo &RI = En.value();
3727 Value *LHS = RI.Variable;
3728 Value *RHS =
3729 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3730
3731 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3732 Value *LHSPtr, *RHSPtr;
3733 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3734 &LHSPtr, &RHSPtr, CurFunc));
3735
3736 // Fix the callback code generated to use the correct Values for the LHS
3737 // and RHS.
3738 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3739 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3740 ReductionFunc;
3741 });
3742 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3743 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3744 ReductionFunc;
3745 });
3746 } else {
3747 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3748 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3749 Value *Reduced;
3750 InsertPointOrErrorTy AfterIP =
3751 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3752 if (!AfterIP)
3753 return AfterIP.takeError();
3754 Builder.CreateStore(Reduced, LHS, false);
3755 }
3756 }
3757 emitBlock(ExitBB, CurFunc);
3758 if (ContinuationBlock) {
3759 Builder.CreateBr(ContinuationBlock);
3760 Builder.SetInsertPoint(ContinuationBlock);
3761 }
3762 Config.setEmitLLVMUsed();
3763
3764 return Builder.saveIP();
3765}
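//
// A directive that reaches this GPU lowering might look like:
//
//   #pragma omp target teams distribute parallel for reduction(+ : sum)
//   for (int i = 0; i < n; ++i)
//     sum += a[i];
//
// with IsTeamsReduction selecting the teams entry point.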
3766
3767static Function *getFreshReductionFunc(Module &M) {
3768 Type *VoidTy = Type::getVoidTy(M.getContext());
3769 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3770 auto *FuncTy =
3771 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3772 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3773 ".omp.reduction.func", &M);
3774}
3775
3776static Error populateReductionFunction(
3777 Function *ReductionFunc,
3778 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3779 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3780 Module *Module = ReductionFunc->getParent();
3781 BasicBlock *ReductionFuncBlock =
3782 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3783 Builder.SetInsertPoint(ReductionFuncBlock);
3784 Value *LHSArrayPtr = nullptr;
3785 Value *RHSArrayPtr = nullptr;
3786 if (IsGPU) {
3787 // We need to allocate memory here and set up the pointers before
3788 // extracting the LHS/RHS pointers.
3789 //
3790 Argument *Arg0 = ReductionFunc->getArg(0);
3791 Argument *Arg1 = ReductionFunc->getArg(1);
3792 Type *Arg0Type = Arg0->getType();
3793 Type *Arg1Type = Arg1->getType();
3794
3795 Value *LHSAlloca =
3796 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3797 Value *RHSAlloca =
3798 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3799 Value *LHSAddrCast =
3800 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3801 Value *RHSAddrCast =
3802 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3803 Builder.CreateStore(Arg0, LHSAddrCast);
3804 Builder.CreateStore(Arg1, RHSAddrCast);
3805 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3806 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3807 } else {
3808 LHSArrayPtr = ReductionFunc->getArg(0);
3809 RHSArrayPtr = ReductionFunc->getArg(1);
3810 }
3811
3812 unsigned NumReductions = ReductionInfos.size();
3813 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3814
3815 for (auto En : enumerate(ReductionInfos)) {
3816 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3817 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3818 RedArrayTy, LHSArrayPtr, 0, En.index());
3819 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3820 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3821 LHSI8Ptr, RI.Variable->getType());
3822 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3823 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3824 RedArrayTy, RHSArrayPtr, 0, En.index());
3825 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3826 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3827 RHSI8Ptr, RI.PrivateVariable->getType());
3828 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3829 Value *Reduced;
3830 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3831 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3832 if (!AfterIP)
3833 return AfterIP.takeError();
3834
3835 Builder.restoreIP(*AfterIP);
3836 // TODO: Consider flagging an error.
3837 if (!Builder.GetInsertBlock())
3838 return Error::success();
3839
3840 // The store is inside the reduction region when using by-ref.
3841 if (!IsByRef[En.index()])
3842 Builder.CreateStore(Reduced, LHSPtr);
3843 }
3844 Builder.CreateRetVoid();
3845 return Error::success();
3846}
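//
// For a single variable of type T combined with `op`, the populated
// function is roughly (names illustrative):
//
//   void omp_reduction_func(void **lhs_list, void **rhs_list) {
//     T *lhs = (T *)lhs_list[0];
//     T *rhs = (T *)rhs_list[0];
//     *lhs = *lhs op *rhs;  // by-ref reductions store inside ReductionGen
//   }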
3847
3848OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3849 const LocationDescription &Loc, InsertPointTy AllocaIP,
3850 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3851 bool IsNoWait, bool IsTeamsReduction) {
3852 assert(ReductionInfos.size() == IsByRef.size());
3853 if (Config.isGPU())
3854 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3855 IsNoWait, IsTeamsReduction);
3856
3857 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3858
3859 if (!updateToLocation(Loc))
3860 return InsertPointTy();
3861
3862 if (ReductionInfos.size() == 0)
3863 return Builder.saveIP();
3864
3865 BasicBlock *InsertBlock = Loc.IP.getBlock();
3866 BasicBlock *ContinuationBlock =
3867 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3868 InsertBlock->getTerminator()->eraseFromParent();
3869
3870 // Create and populate array of type-erased pointers to private reduction
3871 // values.
3872 unsigned NumReductions = ReductionInfos.size();
3873 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3874 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3875 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3876
3877 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3878
3879 for (auto En : enumerate(ReductionInfos)) {
3880 unsigned Index = En.index();
3881 const ReductionInfo &RI = En.value();
3882 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3883 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3884 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3885 }
3886
3887 // Emit a call to the runtime function that orchestrates the reduction.
3888 // Declare the reduction function in the process.
3889 Type *IndexTy = Builder.getIndexTy(
3890 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3891 Function *Func = Builder.GetInsertBlock()->getParent();
3892 Module *Module = Func->getParent();
3893 uint32_t SrcLocStrSize;
3894 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3895 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3896 return RI.AtomicReductionGen;
3897 });
3898 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3899 CanGenerateAtomic
3900 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3901 : IdentFlag(0));
3902 Value *ThreadId = getOrCreateThreadID(Ident);
3903 Constant *NumVariables = Builder.getInt32(NumReductions);
3904 const DataLayout &DL = Module->getDataLayout();
3905 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3906 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3907 Function *ReductionFunc = getFreshReductionFunc(*Module);
3908 Value *Lock = getOMPCriticalRegionLock(".reduction");
3909 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3910 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3911 : RuntimeFunction::OMPRTL___kmpc_reduce);
3912 CallInst *ReduceCall =
3913 Builder.CreateCall(ReduceFunc,
3914 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3915 ReductionFunc, Lock},
3916 "reduce");
3917
3918 // Create final reduction entry blocks for the atomic and non-atomic case.
3919 // Emit IR that dispatches control flow to one of the blocks based on the
3920 // reduction supporting the atomic mode.
3921 BasicBlock *NonAtomicRedBlock =
3922 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3923 BasicBlock *AtomicRedBlock =
3924 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3925 SwitchInst *Switch =
3926 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3927 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3928 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3929
3930 // Populate the non-atomic reduction using the elementwise reduction function.
3931 // This loads the elements from the global and private variables and reduces
3932 // them before storing the result back to the global variable.
3933 Builder.SetInsertPoint(NonAtomicRedBlock);
3934 for (auto En : enumerate(ReductionInfos)) {
3935 const ReductionInfo &RI = En.value();
3936 Type *ValueType = RI.ElementType;
3937 // We have one less load for the by-ref case because that load is now
3938 // inside the reduction region.
3939 Value *RedValue = RI.Variable;
3940 if (!IsByRef[En.index()]) {
3941 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3942 "red.value." + Twine(En.index()));
3943 }
3944 Value *PrivateRedValue =
3945 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3946 "red.private.value." + Twine(En.index()));
3947 Value *Reduced;
3948 InsertPointOrErrorTy AfterIP =
3949 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3950 if (!AfterIP)
3951 return AfterIP.takeError();
3952 Builder.restoreIP(*AfterIP);
3953
3954 if (!Builder.GetInsertBlock())
3955 return InsertPointTy();
3956 // For the by-ref case, the store happens inside the reduction region.
3957 if (!IsByRef[En.index()])
3958 Builder.CreateStore(Reduced, RI.Variable);
3959 }
3960 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3961 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3962 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3963 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3964 Builder.CreateBr(ContinuationBlock);
3965
3966 // Populate the atomic reduction using the atomic elementwise reduction
3967 // function. There are no loads/stores here because they will be happening
3968 // inside the atomic elementwise reduction.
3969 Builder.SetInsertPoint(AtomicRedBlock);
3970 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3971 for (const ReductionInfo &RI : ReductionInfos) {
3972 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3973 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3974 if (!AfterIP)
3975 return AfterIP.takeError();
3976 Builder.restoreIP(*AfterIP);
3977 if (!Builder.GetInsertBlock())
3978 return InsertPointTy();
3979 }
3980 Builder.CreateBr(ContinuationBlock);
3981 } else {
3982 Builder.CreateUnreachable();
3983 }
3984
3985 // Populate the outlined reduction function using the elementwise reduction
3986 // function. Partial values are extracted from the type-erased array of
3987 // pointers to private variables.
3988 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3989 IsByRef, /*isGPU=*/false);
3990 if (Err)
3991 return Err;
3992
3993 if (!Builder.GetInsertBlock())
3994 return InsertPointTy();
3995
3996 Builder.SetInsertPoint(ContinuationBlock);
3997 return Builder.saveIP();
3998}
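//
// The host-side pattern emitted above is roughly (names illustrative):
//
//   switch (__kmpc_reduce(loc, tid, <n>, sizeof(red_list), red_list,
//                         reduction_func, &lock)) {
//   case 1:  // non-atomic: combine privates into the originals, then
//     __kmpc_end_reduce(loc, tid, &lock);
//     break;
//   case 2:  // atomic: per-variable atomic combine
//     break;
//   default: // nothing to do
//     break;
//   }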
3999
4000OpenMPIRBuilder::InsertPointOrErrorTy
4001OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4002 BodyGenCallbackTy BodyGenCB,
4003 FinalizeCallbackTy FiniCB) {
4004 if (!updateToLocation(Loc))
4005 return Loc.IP;
4006
4007 Directive OMPD = Directive::OMPD_master;
4008 uint32_t SrcLocStrSize;
4009 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4010 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4011 Value *ThreadId = getOrCreateThreadID(Ident);
4012 Value *Args[] = {Ident, ThreadId};
4013
4014 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4015 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4016
4017 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4018 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4019
4020 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4021 /*Conditional*/ true, /*hasFinalize*/ true);
4022}
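//
// The conditional inlined region above corresponds roughly to:
//
//   if (__kmpc_master(loc, tid)) {
//     /* master region body */
//     __kmpc_end_master(loc, tid);
//   }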
4023
4024OpenMPIRBuilder::InsertPointOrErrorTy
4025OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4026 BodyGenCallbackTy BodyGenCB,
4027 FinalizeCallbackTy FiniCB, Value *Filter) {
4028 if (!updateToLocation(Loc))
4029 return Loc.IP;
4030
4031 Directive OMPD = Directive::OMPD_masked;
4032 uint32_t SrcLocStrSize;
4033 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4034 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4035 Value *ThreadId = getOrCreateThreadID(Ident);
4036 Value *Args[] = {Ident, ThreadId, Filter};
4037 Value *ArgsEnd[] = {Ident, ThreadId};
4038
4039 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4040 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4041
4042 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4043 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4044
4045 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4046 /*Conditional*/ true, /*hasFinalize*/ true);
4047}
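//
// Same shape as createMaster, with a thread filter:
//
//   if (__kmpc_masked(loc, tid, filter)) {
//     /* masked region body */
//     __kmpc_end_masked(loc, tid);
//   }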
4048
4049static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
4050 llvm::FunctionCallee Callee,
4051 ArrayRef<llvm::Value *> Args,
4052 const llvm::Twine &Name) {
4053 llvm::CallInst *Call = Builder.CreateCall(
4054 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4055 Call->setDoesNotThrow();
4056 return Call;
4057}
4058
4059// Expects the input basic block to be dominated by BeforeScanBB.
4060// Once the scan directive is encountered, the code after it should be
4061// dominated by AfterScanBB. The scan directive splits the code sequence
4062// into an input phase and a scan phase. Based on whether the inclusive or
4063// exclusive clause is used on the scan directive, and whether the input
4064// loop or the scan loop is being lowered, it adds jumps to the input and
4065// scan phases. The first scan loop is the input loop and the second is
4066// the scan loop. Currently the generated code handles only inclusive scans.
4067OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4068 const LocationDescription &Loc, InsertPointTy AllocaIP,
4069 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4070 bool IsInclusive, ScanInfo *ScanRedInfo) {
4071 if (ScanRedInfo->OMPFirstScanLoop) {
4072 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4073 ScanVarsType, ScanRedInfo);
4074 if (Err)
4075 return Err;
4076 }
4077 if (!updateToLocation(Loc))
4078 return Loc.IP;
4079
4080 llvm::Value *IV = ScanRedInfo->IV;
4081
4082 if (ScanRedInfo->OMPFirstScanLoop) {
4083 // Emit buffer[i] = red; at the end of the input phase.
4084 for (size_t i = 0; i < ScanVars.size(); i++) {
4085 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4086 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4087 Type *DestTy = ScanVarsType[i];
4088 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4089 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4090
4091 Builder.CreateStore(Src, Val);
4092 }
4093 }
4094 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4095 emitBlock(ScanRedInfo->OMPScanDispatch,
4096 Builder.GetInsertBlock()->getParent());
4097
4098 if (!ScanRedInfo->OMPFirstScanLoop) {
4099 IV = ScanRedInfo->IV;
4100 // Emit red = buffer[i]; at the entrance to the scan phase.
4101 // TODO: for an exclusive scan, this needs to be updated to red = buffer[i-1].
4102 for (size_t i = 0; i < ScanVars.size(); i++) {
4103 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4104 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4105 Type *DestTy = ScanVarsType[i];
4106 Value *SrcPtr =
4107 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4108 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4109 Builder.CreateStore(Src, ScanVars[i]);
4110 }
4111 }
4112
4113 // TODO: Update it to CreateBr and remove dead blocks
4114 llvm::Value *CmpI = Builder.getInt1(true);
4115 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4116 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4117 ScanRedInfo->OMPAfterScanBlock);
4118 } else {
4119 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4120 ScanRedInfo->OMPBeforeScanBlock);
4121 }
4122 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4123 Builder.GetInsertBlock()->getParent());
4124 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4125 return Builder.saveIP();
4126}
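//
// A directive handled by createScan might look like (inclusive form):
//
//   #pragma omp for reduction(inscan, + : x)
//   for (int i = 0; i < n; ++i) {
//     x += a[i];                    // input phase
//     #pragma omp scan inclusive(x)
//     b[i] = x;                     // scan phase
//   }
//
// Both generated loops share this body; the branch emitted above selects
// the input or scan phase depending on which loop is being lowered.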
4127
4128Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4129 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4130 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4131
4132 Builder.restoreIP(AllocaIP);
4133 // Create the shared pointer at alloca IP.
4134 for (size_t i = 0; i < ScanVars.size(); i++) {
4135 llvm::Value *BuffPtr =
4136 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4137 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4138 }
4139
4140 // Allocate the temporary buffer on the master thread.
4141 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4142 InsertPointTy CodeGenIP) -> Error {
4143 Builder.restoreIP(CodeGenIP);
4144 Value *AllocSpan =
4145 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4146 for (size_t i = 0; i < ScanVars.size(); i++) {
4147 Type *IntPtrTy = Builder.getInt32Ty();
4148 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4149 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4150 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4151 AllocSpan, nullptr, "arr");
4152 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4153 }
4154 return Error::success();
4155 };
4156 // TODO: Perform finalization actions for variables. This has to be
4157 // called for variables which have destructors/finalizers.
4158 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4159
4160 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4161 llvm::Value *FilterVal = Builder.getInt32(0);
4162 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4163 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4164
4165 if (!AfterIP)
4166 return AfterIP.takeError();
4167 Builder.restoreIP(*AfterIP);
4168 BasicBlock *InputBB = Builder.GetInsertBlock();
4169 if (InputBB->getTerminator())
4170 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4171 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4172 if (!AfterIP)
4173 return AfterIP.takeError();
4174 Builder.restoreIP(*AfterIP);
4175
4176 return Error::success();
4177}
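//
// Per scan variable this amounts to, on the master thread only (names
// illustrative):
//
//   T *arr = malloc((span + 1) * sizeof(T));  // inside masked(filter=0)
//   *vla_slot = arr;
//   // followed by a team barrier so all threads see the buffers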
4178
4179Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4180 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4181 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4182 InsertPointTy CodeGenIP) -> Error {
4183 Builder.restoreIP(CodeGenIP);
4184 for (ReductionInfo RedInfo : ReductionInfos) {
4185 Value *PrivateVar = RedInfo.PrivateVariable;
4186 Value *OrigVar = RedInfo.Variable;
4187 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4188 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4189
4190 Type *SrcTy = RedInfo.ElementType;
4191 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4192 "arrayOffset");
4193 Value *Src = Builder.CreateLoad(SrcTy, Val);
4194
4195 Builder.CreateStore(Src, OrigVar);
4196 Builder.CreateFree(Buff);
4197 }
4198 return Error::success();
4199 };
4200 // TODO: Perform finalization actions for variables. This has to be
4201 // called for variables which have destructors/finalizers.
4202 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4203
4204 if (ScanRedInfo->OMPScanFinish->getTerminator())
4205 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4206 else
4207 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4208
4209 llvm::Value *FilterVal = Builder.getInt32(0);
4210 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4211 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4212
4213 if (!AfterIP)
4214 return AfterIP.takeError();
4215 Builder.restoreIP(*AfterIP);
4216 BasicBlock *InputBB = Builder.GetInsertBlock();
4217 if (InputBB->getTerminator())
4218 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4219 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4220 if (!AfterIP)
4221 return AfterIP.takeError();
4222 Builder.restoreIP(*AfterIP);
4223 return Error::success();
4224}
4225
4226OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4227 const LocationDescription &Loc,
4228 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4229 ScanInfo *ScanRedInfo) {
4230
4231 if (!updateToLocation(Loc))
4232 return Loc.IP;
4233 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4234 InsertPointTy CodeGenIP) -> Error {
4235 Builder.restoreIP(CodeGenIP);
4236 Function *CurFn = Builder.GetInsertBlock()->getParent();
4237 // for (int k = 0; k <= ceil(log2(n)); ++k)
4238 llvm::BasicBlock *LoopBB =
4239 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4240 llvm::BasicBlock *ExitBB =
4241 splitBB(Builder, false, "omp.outer.log.scan.exit");
4242 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4243 Builder.GetInsertBlock()->getModule(),
4244 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4245 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4246 llvm::Value *Arg =
4247 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4248 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4249 F = llvm::Intrinsic::getOrInsertDeclaration(
4250 Builder.GetInsertBlock()->getModule(),
4251 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4252 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4253 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4254 llvm::Value *NMin1 = Builder.CreateNUWSub(
4255 ScanRedInfo->Span,
4256 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4257 Builder.SetInsertPoint(InputBB);
4258 Builder.CreateBr(LoopBB);
4259 emitBlock(LoopBB, CurFn);
4260 Builder.SetInsertPoint(LoopBB);
4261
4262 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4263 // size pow2k = 1;
4264 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4265 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4266 InputBB);
4267 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4268 InputBB);
4269 // for (size i = n - 1; i >= 2 ^ k; --i)
4270 // tmp[i] op= tmp[i-pow2k];
4271 llvm::BasicBlock *InnerLoopBB =
4272 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4273 llvm::BasicBlock *InnerExitBB =
4274 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4275 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4276 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4277 emitBlock(InnerLoopBB, CurFn);
4278 Builder.SetInsertPoint(InnerLoopBB);
4279 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4280 IVal->addIncoming(NMin1, LoopBB);
4281 for (ReductionInfo RedInfo : ReductionInfos) {
4282 Value *ReductionVal = RedInfo.PrivateVariable;
4283 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4284 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4285 Type *DestTy = RedInfo.ElementType;
4286 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4287 Value *LHSPtr =
4288 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4289 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4290 Value *RHSPtr =
4291 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4292 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4293 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4294 llvm::Value *Result;
4295 InsertPointOrErrorTy AfterIP =
4296 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4297 if (!AfterIP)
4298 return AfterIP.takeError();
4299 Builder.CreateStore(Result, LHSPtr);
4300 }
4301 llvm::Value *NextIVal = Builder.CreateNUWSub(
4302 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4303 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4304 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4305 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4306 emitBlock(InnerExitBB, CurFn);
4307 llvm::Value *Next = Builder.CreateNUWAdd(
4308 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4309 Counter->addIncoming(Next, Builder.GetInsertBlock());
4310 // pow2k <<= 1;
4311 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4312 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4313 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4314 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4315 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4316 return Error::success();
4317 };
4318
4319 // TODO: Perform finalization actions for variables. This has to be
4320 // called for variables which have destructors/finalizers.
4321 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4322
4323 llvm::Value *FilterVal = Builder.getInt32(0);
4324 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4325 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4326
4327 if (!AfterIP)
4328 return AfterIP.takeError();
4329 Builder.restoreIP(*AfterIP);
4330 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4331
4332 if (!AfterIP)
4333 return AfterIP.takeError();
4334 Builder.restoreIP(*AfterIP);
4335 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4336 if (Err)
4337 return Err;
4338
4339 return AfterIP;
4340}
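
// Illustration of the log-time scan performed above (a sketch, not emitted
// IR): assuming a '+' reduction over buffer slots tmp[1..n] with n = Span = 4,
// the masked region runs ceil(log2(4)) = 2 rounds of
//   pow2k = 1:  tmp[4] += tmp[3]; tmp[3] += tmp[2]; tmp[2] += tmp[1];
//   pow2k = 2:  tmp[4] += tmp[2]; tmp[3] += tmp[1];
// after which tmp[i] holds the inclusive prefix reduction of the first i
// elements; emitScanBasedDirectiveFinalsIR then copies tmp[n] back into the
// original reduction variable.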
4341
4342Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4343 llvm::function_ref<Error()> InputLoopGen,
4344 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4345 ScanInfo *ScanRedInfo) {
4346
4347 {
4348 // Emit loop with input phase:
4349 // for (i: 0..<num_iters>) {
4350 // <input phase>;
4351 // buffer[i] = red;
4352 // }
4353 ScanRedInfo->OMPFirstScanLoop = true;
4354 Error Err = InputLoopGen();
4355 if (Err)
4356 return Err;
4357 }
4358 {
4359 // Emit loop with scan phase:
4360 // for (i: 0..<num_iters>) {
4361 // red = buffer[i];
4362 // <scan phase>;
4363 // }
4364 ScanRedInfo->OMPFirstScanLoop = false;
4365 Error Err = ScanLoopGen(Builder.saveIP());
4366 if (Err)
4367 return Err;
4368 }
4369 return Error::success();
4370}
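
// For example, a worksharing loop containing an inclusive scan, roughly
//   #pragma omp for reduction(inscan, +: red)
//   for (...) { red += a[i]; #pragma omp scan inclusive(red) b[i] = red; }
// is emitted as the two loops sketched in the comments above: the first runs
// only the input phase and stores red into buffer[i]; once the buffer has been
// turned into prefix reductions, the second reloads buffer[i] and runs only
// the scan phase.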
4371
4372void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4373 Function *Fun = Builder.GetInsertBlock()->getParent();
4374 ScanRedInfo->OMPScanDispatch =
4375 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4376 ScanRedInfo->OMPAfterScanBlock =
4377 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4378 ScanRedInfo->OMPBeforeScanBlock =
4379 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4380 ScanRedInfo->OMPScanLoopExit =
4381 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4382}
4383CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4384 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4385 BasicBlock *PostInsertBefore, const Twine &Name) {
4386 Module *M = F->getParent();
4387 LLVMContext &Ctx = M->getContext();
4388 Type *IndVarTy = TripCount->getType();
4389
4390 // Create the basic block structure.
4391 BasicBlock *Preheader =
4392 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4393 BasicBlock *Header =
4394 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4395 BasicBlock *Cond =
4396 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4397 BasicBlock *Body =
4398 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4399 BasicBlock *Latch =
4400 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4401 BasicBlock *Exit =
4402 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4403 BasicBlock *After =
4404 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4405
4406 // Use specified DebugLoc for new instructions.
4407 Builder.SetCurrentDebugLocation(DL);
4408
4409 Builder.SetInsertPoint(Preheader);
4410 Builder.CreateBr(Header);
4411
4412 Builder.SetInsertPoint(Header);
4413 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4414 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4415 Builder.CreateBr(Cond);
4416
4417 Builder.SetInsertPoint(Cond);
4418 Value *Cmp =
4419 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4420 Builder.CreateCondBr(Cmp, Body, Exit);
4421
4422 Builder.SetInsertPoint(Body);
4423 Builder.CreateBr(Latch);
4424
4425 Builder.SetInsertPoint(Latch);
4426 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4427 "omp_" + Name + ".next", /*HasNUW=*/true);
4428 Builder.CreateBr(Header);
4429 IndVarPHI->addIncoming(Next, Latch);
4430
4431 Builder.SetInsertPoint(Exit);
4432 Builder.CreateBr(After);
4433
4434 // Remember and return the canonical control flow.
4435 LoopInfos.emplace_front();
4436 CanonicalLoopInfo *CL = &LoopInfos.front();
4437
4438 CL->Header = Header;
4439 CL->Cond = Cond;
4440 CL->Latch = Latch;
4441 CL->Exit = Exit;
4442
4443#ifndef NDEBUG
4444 CL->assertOK();
4445#endif
4446 return CL;
4447}
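
// The skeleton built above has the following shape (a CFG sketch only):
//   preheader -> header -> cond --(iv ult tripcount)--> body -> inc -> header
//                            `--(otherwise)--> exit -> after
// where the induction variable is the header PHI
//   %iv = phi [0, %preheader], [%iv + 1, %inc]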
4448
4449Expected<CanonicalLoopInfo *>
4450OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4451 LoopBodyGenCallbackTy BodyGenCB,
4452 Value *TripCount, const Twine &Name) {
4453 BasicBlock *BB = Loc.IP.getBlock();
4454 BasicBlock *NextBB = BB->getNextNode();
4455
4456 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4457 NextBB, NextBB, Name);
4458 BasicBlock *After = CL->getAfter();
4459
4460 // If location is not set, don't connect the loop.
4461 if (updateToLocation(Loc)) {
4462 // Split the loop at the insertion point: Branch to the preheader and move
4463 // every following instruction to after the loop (the After BB). Also, the
4464 // new successor is the loop's after block.
4465 spliceBB(Builder, After, /*CreateBranch=*/false);
4466 Builder.CreateBr(CL->getPreheader());
4467 }
4468
4469 // Emit the body content. We do it after connecting the loop to the CFG to
4470 // avoid the callback encountering degenerate BBs.
4471 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4472 return Err;
4473
4474#ifndef NDEBUG
4475 CL->assertOK();
4476#endif
4477 return CL;
4478}
4479
4480Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4481 ScanInfos.emplace_front();
4482 ScanInfo *Result = &ScanInfos.front();
4483 return Result;
4484}
4485
4486Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4487OpenMPIRBuilder::createCanonicalScanLoops(
4488 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4489 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4490 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4491 LocationDescription ComputeLoc =
4492 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4493 updateToLocation(ComputeLoc);
4494
4495 SmallVector<llvm::CanonicalLoopInfo *> Result;
4496
4497 Value *TripCount = calculateCanonicalLoopTripCount(
4498 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4499 ScanRedInfo->Span = TripCount;
4500 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4501 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4502
4503 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4504 Builder.restoreIP(CodeGenIP);
4505 ScanRedInfo->IV = IV;
4506 createScanBBs(ScanRedInfo);
4507 BasicBlock *InputBlock = Builder.GetInsertBlock();
4508 Instruction *Terminator = InputBlock->getTerminator();
4509 assert(Terminator->getNumSuccessors() == 1);
4510 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4511 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4512 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4513 Builder.GetInsertBlock()->getParent());
4514 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4515 emitBlock(ScanRedInfo->OMPScanLoopExit,
4516 Builder.GetInsertBlock()->getParent());
4517 Builder.CreateBr(ContinueBlock);
4518 Builder.SetInsertPoint(
4519 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4520 return BodyGenCB(Builder.saveIP(), IV);
4521 };
4522
4523 const auto &&InputLoopGen = [&]() -> Error {
4524 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4525 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4526 ComputeIP, Name, true, ScanRedInfo);
4527 if (!LoopInfo)
4528 return LoopInfo.takeError();
4529 Result.push_back(*LoopInfo);
4530 Builder.restoreIP((*LoopInfo)->getAfterIP());
4531 return Error::success();
4532 };
4533 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4534 Expected<CanonicalLoopInfo *> LoopInfo =
4535 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4536 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4537 if (!LoopInfo)
4538 return LoopInfo.takeError();
4539 Result.push_back(*LoopInfo);
4540 Builder.restoreIP((*LoopInfo)->getAfterIP());
4541 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4542 return Error::success();
4543 };
4544 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4545 if (Err)
4546 return Err;
4547 return Result;
4548}
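
// In both generated loops, BodyGen above reroutes each iteration through the
// scan blocks (a sketch; the branch out of omp.inscan.dispatch is wired later
// by the scan-directive codegen, which uses OMPFirstScanLoop to route the
// first loop through the input phase and the second through the scan phase):
//   body -> omp.inscan.dispatch -> omp.before.scan.bb | omp.after.scan.bb
//        -> omp.scan.loop.exit -> latch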
4549
4550Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4551 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4552 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4553
4554 // Consider the following difficulties (assuming 8-bit signed integers):
4555 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4556 // DO I = 1, 100, 50
4557 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4558 // DO I = 100, 0, -128
4559
4560 // Start, Stop and Step must be of the same integer type.
4561 auto *IndVarTy = cast<IntegerType>(Start->getType());
4562 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4563 assert(IndVarTy == Step->getType() && "Step type mismatch");
4564
4565 updateToLocation(Loc);
4566
4567 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4568 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4569
4570 // Like Step, but always positive.
4571 Value *Incr = Step;
4572
4573 // Distance between Start and Stop; always positive.
4574 Value *Span;
4575
4576 // Condition under which no iterations are executed at all, e.g. because
4577 // UB < LB.
4578 Value *ZeroCmp;
4579
4580 if (IsSigned) {
4581 // Ensure that increment is positive. If not, negate and invert LB and UB.
4582 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4583 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4584 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4585 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4586 Span = Builder.CreateSub(UB, LB, "", false, true);
4587 ZeroCmp = Builder.CreateICmp(
4588 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4589 } else {
4590 Span = Builder.CreateSub(Stop, Start, "", true);
4591 ZeroCmp = Builder.CreateICmp(
4592 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4593 }
4594
4595 Value *CountIfLooping;
4596 if (InclusiveStop) {
4597 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4598 } else {
4599 // Avoid incrementing past stop since it could overflow.
4600 Value *CountIfTwo = Builder.CreateAdd(
4601 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4602 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4603 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4604 }
4605
4606 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4607 "omp_" + Name + ".tripcount");
4608}
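
// Worked example (unsigned, exclusive stop): Start = 0, Stop = 10, Step = 3
// gives Span = 10, Incr = 3, CountIfTwo = (10 - 1) / 3 + 1 = 4, and the final
// select yields a tripcount of 4, i.e. iterations 0, 3, 6, 9. If instead
// Stop <= Start, ZeroCmp selects 0 rather than letting Span wrap around.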
4609
4610Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4611 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4612 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4613 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4614 ScanInfo *ScanRedInfo) {
4615 LocationDescription ComputeLoc =
4616 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4617
4618 Value *TripCount = calculateCanonicalLoopTripCount(
4619 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4620
4621 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4622 Builder.restoreIP(CodeGenIP);
4623 Value *Span = Builder.CreateMul(IV, Step);
4624 Value *IndVar = Builder.CreateAdd(Span, Start);
4625 if (InScan)
4626 ScanRedInfo->IV = IndVar;
4627 return BodyGenCB(Builder.saveIP(), IndVar);
4628 };
4629 LocationDescription LoopLoc =
4630 ComputeIP.isSet()
4631 ? Loc
4632 : LocationDescription(Builder.saveIP(),
4633 Builder.getCurrentDebugLocation());
4634 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4635}
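
// BodyGen above maps the normalized IV of the skeleton loop back into the
// user's iteration space as IndVar = Start + IV * Step; e.g. with Start = 0
// and Step = 3 the callback observes 0, 3, 6, 9 while the canonical IV runs
// from 0 to 3.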
4636
4637// Returns an LLVM function to call for initializing loop bounds using OpenMP
4638// static scheduling for composite `distribute parallel for` depending on
4639// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4640// integers as unsigned similarly to CanonicalLoopInfo.
4642static FunctionCallee
4643getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4644 unsigned Bitwidth = Ty->getIntegerBitWidth();
4645 if (Bitwidth == 32)
4646 return OMPBuilder.getOrCreateRuntimeFunction(
4647 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4648 if (Bitwidth == 64)
4649 return OMPBuilder.getOrCreateRuntimeFunction(
4650 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4651 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4652}
4653
4654// Returns an LLVM function to call for initializing loop bounds using OpenMP
4655// static scheduling depending on `type`. Only i32 and i64 are supported by the
4656// runtime. Always interpret integers as unsigned similarly to
4657// CanonicalLoopInfo.
4658static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4659 OpenMPIRBuilder &OMPBuilder) {
4660 unsigned Bitwidth = Ty->getIntegerBitWidth();
4661 if (Bitwidth == 32)
4662 return OMPBuilder.getOrCreateRuntimeFunction(
4663 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4664 if (Bitwidth == 64)
4665 return OMPBuilder.getOrCreateRuntimeFunction(
4666 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4667 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4668}
4669
4670OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4671 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4672 WorksharingLoopType LoopType, bool NeedsBarrier) {
4673 assert(CLI->isValid() && "Requires a valid canonical loop");
4674 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4675 "Require dedicated allocate IP");
4676
4677 // Set up the source location value for OpenMP runtime.
4678 Builder.restoreIP(CLI->getPreheaderIP());
4679 Builder.SetCurrentDebugLocation(DL);
4680
4681 uint32_t SrcLocStrSize;
4682 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4683 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4684
4685 // Declare useful OpenMP runtime functions.
4686 Value *IV = CLI->getIndVar();
4687 Type *IVTy = IV->getType();
4688 FunctionCallee StaticInit =
4689 LoopType == WorksharingLoopType::DistributeForStaticLoop
4690 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4691 : getKmpcForStaticInitForType(IVTy, M, *this);
4692 FunctionCallee StaticFini =
4693 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4694
4695 // Allocate space for computed loop bounds as expected by the "init" function.
4696 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4697
4698 Type *I32Type = Type::getInt32Ty(M.getContext());
4699 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4700 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4701 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4702 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4703 CLI->setLastIter(PLastIter);
4704
4705 // At the end of the preheader, prepare for calling the "init" function by
4706 // storing the current loop bounds into the allocated space. A canonical loop
4707 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4708 // and produces an inclusive upper bound.
4709 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4710 Constant *Zero = ConstantInt::get(IVTy, 0);
4711 Constant *One = ConstantInt::get(IVTy, 1);
4712 Builder.CreateStore(Zero, PLowerBound);
4713 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4714 Builder.CreateStore(UpperBound, PUpperBound);
4715 Builder.CreateStore(One, PStride);
4716
4717 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4718
4719 OMPScheduleType SchedType =
4720 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4721 ? OMPScheduleType::OrderedDistribute
4722 : OMPScheduleType::UnorderedStatic;
4723 Constant *SchedulingType =
4724 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4725
4726 // Call the "init" function and update the trip count of the loop with the
4727 // value it produced.
4728 SmallVector<Value *, 10> Args(
4729 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4730 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4731 Value *PDistUpperBound =
4732 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4733 Args.push_back(PDistUpperBound);
4734 }
4735 Args.append({PStride, One, Zero});
4736 Builder.CreateCall(StaticInit, Args);
4737 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4738 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4739 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4740 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4741 CLI->setTripCount(TripCount);
4742
4743 // Update all uses of the induction variable except the one in the condition
4744 // block that compares it with the actual upper bound, and the increment in
4745 // the latch block.
4746
4747 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4748 Builder.SetInsertPoint(CLI->getBody(),
4749 CLI->getBody()->getFirstInsertionPt());
4750 Builder.SetCurrentDebugLocation(DL);
4751 return Builder.CreateAdd(OldIV, LowerBound);
4752 });
4753
4754 // In the "exit" block, call the "fini" function.
4755 Builder.SetInsertPoint(CLI->getExit(),
4756 CLI->getExit()->getTerminator()->getIterator());
4757 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4758
4759 // Add the barrier if requested.
4760 if (NeedsBarrier) {
4761 InsertPointOrErrorTy BarrierIP =
4762 createBarrier(LocationDescription(Builder.saveIP(), DL),
4763 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4764 /* CheckCancelFlag */ false);
4765 if (!BarrierIP)
4766 return BarrierIP.takeError();
4767 }
4768
4769 InsertPointTy AfterIP = CLI->getAfterIP();
4770 CLI->invalidate();
4771
4772 return AfterIP;
4773}
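
// Sketch of the preheader code produced above (abridged, not verbatim IR):
//   %lb = 0; %ub = tripcount - 1; %stride = 1
//   call @__kmpc_for_static_init_{4u,8u}(%loc, %tid, schedtype, &%lastiter,
//                                        &%lb, &%ub, &%stride, /*incr=*/1,
//                                        /*chunk=*/0)
//   tripcount' = %ub - %lb + 1          ; bounds are inclusive
// so the loop body only runs this thread's chunk, and every body use of the
// IV is rewritten to %iv + %lb.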
4774
4775OpenMPIRBuilder::InsertPointOrErrorTy
4776OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4777 CanonicalLoopInfo *CLI,
4778 InsertPointTy AllocaIP,
4779 bool NeedsBarrier,
4780 Value *ChunkSize) {
4781 assert(CLI->isValid() && "Requires a valid canonical loop");
4782 assert(ChunkSize && "Chunk size is required");
4783
4784 LLVMContext &Ctx = CLI->getFunction()->getContext();
4785 Value *IV = CLI->getIndVar();
4786 Value *OrigTripCount = CLI->getTripCount();
4787 Type *IVTy = IV->getType();
4788 assert(IVTy->getIntegerBitWidth() <= 64 &&
4789 "Max supported tripcount bitwidth is 64 bits");
4790 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4791 : Type::getInt64Ty(Ctx);
4792 Type *I32Type = Type::getInt32Ty(M.getContext());
4793 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4794 Constant *One = ConstantInt::get(InternalIVTy, 1);
4795
4796 // Declare useful OpenMP runtime functions.
4797 FunctionCallee StaticInit =
4798 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4799 FunctionCallee StaticFini =
4800 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4801
4802 // Allocate space for computed loop bounds as expected by the "init" function.
4803 Builder.restoreIP(AllocaIP);
4804 Builder.SetCurrentDebugLocation(DL);
4805 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4806 Value *PLowerBound =
4807 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4808 Value *PUpperBound =
4809 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4810 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4811 CLI->setLastIter(PLastIter);
4812
4813 // Set up the source location value for the OpenMP runtime.
4814 Builder.restoreIP(CLI->getPreheaderIP());
4815 Builder.SetCurrentDebugLocation(DL);
4816
4817 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4818 Value *CastedChunkSize =
4819 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4820 Value *CastedTripCount =
4821 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4822
4823 Constant *SchedulingType = ConstantInt::get(
4824 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4825 Builder.CreateStore(Zero, PLowerBound);
4826 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4827 Builder.CreateStore(OrigUpperBound, PUpperBound);
4828 Builder.CreateStore(One, PStride);
4829
4830 // Call the "init" function and update the trip count of the loop with the
4831 // value it produced.
4832 uint32_t SrcLocStrSize;
4833 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4834 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4835 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4836 Builder.CreateCall(StaticInit,
4837 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4838 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4839 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4840 /*pstride=*/PStride, /*incr=*/One,
4841 /*chunk=*/CastedChunkSize});
4842
4843 // Load values written by the "init" function.
4844 Value *FirstChunkStart =
4845 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4846 Value *FirstChunkStop =
4847 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4848 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4849 Value *ChunkRange =
4850 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4851 Value *NextChunkStride =
4852 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4853
4854 // Create outer "dispatch" loop for enumerating the chunks.
4855 BasicBlock *DispatchEnter = splitBB(Builder, true);
4856 Value *DispatchCounter;
4857
4858 // It is safe to assume this didn't return an error because the callback
4859 // passed into createCanonicalLoop is the only possible error source, and it
4860 // always returns success.
4861 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4862 {Builder.saveIP(), DL},
4863 [&](InsertPointTy BodyIP, Value *Counter) {
4864 DispatchCounter = Counter;
4865 return Error::success();
4866 },
4867 FirstChunkStart, CastedTripCount, NextChunkStride,
4868 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4869 "dispatch"));
4870
4871 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4872 // not have to preserve the canonical invariant.
4873 BasicBlock *DispatchBody = DispatchCLI->getBody();
4874 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4875 BasicBlock *DispatchExit = DispatchCLI->getExit();
4876 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4877 DispatchCLI->invalidate();
4878
4879 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4880 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4881 redirectTo(CLI->getExit(), DispatchLatch, DL);
4882 redirectTo(DispatchBody, DispatchEnter, DL);
4883
4884 // Prepare the prolog of the chunk loop.
4885 Builder.restoreIP(CLI->getPreheaderIP());
4886 Builder.SetCurrentDebugLocation(DL);
4887
4888 // Compute the number of iterations of the chunk loop.
4889 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4890 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4891 Value *IsLastChunk =
4892 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4893 Value *CountUntilOrigTripCount =
4894 Builder.CreateSub(CastedTripCount, DispatchCounter);
4895 Value *ChunkTripCount = Builder.CreateSelect(
4896 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4897 Value *BackcastedChunkTC =
4898 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4899 CLI->setTripCount(BackcastedChunkTC);
4900
4901 // Update all uses of the induction variable except the one in the condition
4902 // block that compares it with the actual upper bound, and the increment in
4903 // the latch block.
4904 Value *BackcastedDispatchCounter =
4905 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4906 CLI->mapIndVar([&](Instruction *) -> Value * {
4907 Builder.restoreIP(CLI->getBodyIP());
4908 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4909 });
4910
4911 // In the "exit" block, call the "fini" function.
4912 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4913 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4914
4915 // Add the barrier if requested.
4916 if (NeedsBarrier) {
4917 InsertPointOrErrorTy AfterIP =
4918 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4919 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4920 if (!AfterIP)
4921 return AfterIP.takeError();
4922 }
4923
4924#ifndef NDEBUG
4925 // Even though we currently do not support applying additional methods to it,
4926 // the chunk loop should remain a canonical loop.
4927 CLI->assertOK();
4928#endif
4929
4930 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4931}
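
// The net effect of the rewiring above is a loop nest of the shape (sketch):
//   for (dispatch = lb; dispatch < tripcount; dispatch += stride) // chunks
//     for (iv = 0; iv < min(chunkrange, tripcount - dispatch); ++iv)
//       body(dispatch + iv);
// where lb, ub (giving chunkrange = ub - lb + 1) and stride are produced by
// __kmpc_for_static_init with the static-chunked schedule.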
4932
4933// Returns an LLVM function to call for executing an OpenMP static worksharing
4934// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4935// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4936static FunctionCallee
4937getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4938 WorksharingLoopType LoopType) {
4939 unsigned Bitwidth = Ty->getIntegerBitWidth();
4940 Module &M = OMPBuilder->M;
4941 switch (LoopType) {
4942 case WorksharingLoopType::ForStaticLoop:
4943 if (Bitwidth == 32)
4944 return OMPBuilder->getOrCreateRuntimeFunction(
4945 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4946 if (Bitwidth == 64)
4947 return OMPBuilder->getOrCreateRuntimeFunction(
4948 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4949 break;
4950 case WorksharingLoopType::DistributeStaticLoop:
4951 if (Bitwidth == 32)
4952 return OMPBuilder->getOrCreateRuntimeFunction(
4953 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4954 if (Bitwidth == 64)
4955 return OMPBuilder->getOrCreateRuntimeFunction(
4956 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4957 break;
4958 case WorksharingLoopType::DistributeForStaticLoop:
4959 if (Bitwidth == 32)
4960 return OMPBuilder->getOrCreateRuntimeFunction(
4961 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4962 if (Bitwidth == 64)
4963 return OMPBuilder->getOrCreateRuntimeFunction(
4964 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4965 break;
4966 }
4967 if (Bitwidth != 32 && Bitwidth != 64) {
4968 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4969 }
4970 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4971}
4972
4973// Inserts a call to the proper OpenMP device RTL function which handles
4974// loop worksharing.
4975static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4976 WorksharingLoopType LoopType,
4977 BasicBlock *InsertBlock, Value *Ident,
4978 Value *LoopBodyArg, Value *TripCount,
4979 Function &LoopBodyFn) {
4980 Type *TripCountTy = TripCount->getType();
4981 Module &M = OMPBuilder->M;
4982 IRBuilder<> &Builder = OMPBuilder->Builder;
4983 FunctionCallee RTLFn =
4984 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4985 SmallVector<Value *, 8> RealArgs;
4986 RealArgs.push_back(Ident);
4987 RealArgs.push_back(&LoopBodyFn);
4988 RealArgs.push_back(LoopBodyArg);
4989 RealArgs.push_back(TripCount);
4990 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4991 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4992 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4993 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4994 Builder.CreateCall(RTLFn, RealArgs);
4995 return;
4996 }
4997 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4998 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4999 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5000 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5001
5002 RealArgs.push_back(
5003 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5004 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5005 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5006 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5007 }
5008 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5009
5010 Builder.CreateCall(RTLFn, RealArgs);
5011}
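
// For a plain worksharing loop the sequence emitted above is, roughly:
//   %nt = call i32 @omp_get_num_threads()
//   call void @__kmpc_for_static_loop_{4u,8u}(%ident, @outlined_body, %arg,
//                                             %tripcount, %nt, 0, 0)
// while the distribute flavor skips the thread-count query and passes
// constant zeros instead.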
5012
5013static void workshareLoopTargetCallback(
5014 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5015 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5016 WorksharingLoopType LoopType) {
5017 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5018 BasicBlock *Preheader = CLI->getPreheader();
5019 Value *TripCount = CLI->getTripCount();
5020
5021 // After loop body outlining, the loop body contains only the setup of the
5022 // loop body argument structure and the call to the outlined loop body
5023 // function. First, we need to move the setup of the loop body arguments
5024 // into the loop preheader.
5025 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5026 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5027
5028 // The next step is to remove the whole loop. We do not need it anymore.
5029 // That's why we make an unconditional branch from the loop preheader to the
5030 // loop exit block.
5031 Builder.restoreIP({Preheader, Preheader->end()});
5032 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5033 Preheader->getTerminator()->eraseFromParent();
5034 Builder.CreateBr(CLI->getExit());
5035
5036 // Delete dead loop blocks
5037 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5038 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5039 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5040 CleanUpInfo.EntryBB = CLI->getHeader();
5041 CleanUpInfo.ExitBB = CLI->getExit();
5042 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5043 DeleteDeadBlocks(BlocksToBeRemoved);
5044
5045 // Find the instruction which corresponds to the loop body argument
5046 // structure and remove the call to the loop body function.
5047 Value *LoopBodyArg;
5048 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5049 assert(OutlinedFnUser &&
5050 "Expected unique undroppable user of outlined function");
5051 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5052 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5053 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5054 "Expected outlined function call to be located in loop preheader");
5055 // Check in case no argument structure has been passed.
5056 if (OutlinedFnCallInstruction->arg_size() > 1)
5057 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5058 else
5059 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5060 OutlinedFnCallInstruction->eraseFromParent();
5061
5062 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5063 LoopBodyArg, TripCount, OutlinedFn);
5064
5065 for (auto &ToBeDeletedItem : ToBeDeleted)
5066 ToBeDeletedItem->eraseFromParent();
5067 CLI->invalidate();
5068}
5069
5070OpenMPIRBuilder::InsertPointTy
5071OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
5072 InsertPointTy AllocaIP,
5073 WorksharingLoopType LoopType) {
5074 uint32_t SrcLocStrSize;
5075 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5076 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5077
5078 OutlineInfo OI;
5079 OI.OuterAllocaBB = CLI->getPreheader();
5080 Function *OuterFn = CLI->getPreheader()->getParent();
5081
5082 // Instructions which need to be deleted at the end of code generation
5083 SmallVector<Instruction *, 4> ToBeDeleted;
5084
5085 OI.OuterAllocaBB = AllocaIP.getBlock();
5086
5087 // Mark the loop body as the region which needs to be extracted
5088 OI.EntryBB = CLI->getBody();
5089 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5090 "omp.prelatch", true);
5091
5092 // Prepare loop body for extraction
5093 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5094
5095 // Insert new loop counter variable which will be used only in loop
5096 // body.
5097 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5098 Instruction *NewLoopCntLoad =
5099 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5100 // New loop counter instructions are redundant in the loop preheader when
5101 // code generation for the workshare loop is finished. That's why we mark
5102 // them as ready for deletion.
5103 ToBeDeleted.push_back(NewLoopCntLoad);
5104 ToBeDeleted.push_back(NewLoopCnt);
5105
5106 // Analyse loop body region. Find all input variables which are used inside
5107 // loop body region.
5108 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5109 SmallVector<BasicBlock *, 32> Blocks;
5110 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5111
5112 CodeExtractorAnalysisCache CEAC(*OuterFn);
5113 CodeExtractor Extractor(Blocks,
5114 /* DominatorTree */ nullptr,
5115 /* AggregateArgs */ true,
5116 /* BlockFrequencyInfo */ nullptr,
5117 /* BranchProbabilityInfo */ nullptr,
5118 /* AssumptionCache */ nullptr,
5119 /* AllowVarArgs */ true,
5120 /* AllowAlloca */ true,
5121 /* AllocationBlock */ CLI->getPreheader(),
5122 /* Suffix */ ".omp_wsloop",
5123 /* AggrArgsIn0AddrSpace */ true);
5124
5125 BasicBlock *CommonExit = nullptr;
5126 SetVector<Value *> SinkingCands, HoistingCands;
5127
5128 // Find allocas outside the loop body region which are used inside loop
5129 // body
5130 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5131
5132 // We need to model the loop body region as the function f(cnt, loop_arg).
5133 // That's why we replace the loop induction variable with the new counter,
5134 // which will be one of the loop body function's arguments.
5135 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5136 CLI->getIndVar()->user_end());
5137 for (auto Use : Users) {
5138 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5139 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5140 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5141 }
5142 }
5143 }
5144 // Make sure that the loop counter variable is not merged into the loop body
5145 // function argument structure and that it is passed as a separate variable.
5146 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5147
5148 // The PostOutline CB is invoked when the loop body function has been
5149 // outlined and the loop body replaced by a call to the outlined function.
5150 // We need to add a call to the OpenMP device RTL inside the loop preheader;
5151 // the OpenMP device RTL function will handle the loop control logic.
5152 //
5153 OI.PostOutlineCB = [=, ToBeDeletedVec =
5154 std::move(ToBeDeleted)](Function &OutlinedFn) {
5155 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5156 LoopType);
5157 };
5158 addOutlineInfo(std::move(OI));
5159 return CLI->getAfterIP();
5160}
5161
5162OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5163 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5164 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5165 bool HasSimdModifier, bool HasMonotonicModifier,
5166 bool HasNonmonotonicModifier, bool HasOrderedClause,
5167 WorksharingLoopType LoopType) {
5168 if (Config.isTargetDevice())
5169 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
5170 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5171 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5172 HasNonmonotonicModifier, HasOrderedClause);
5173
5174 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5175 OMPScheduleType::ModifierOrdered;
5176 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5177 case OMPScheduleType::BaseStatic:
5178 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5179 if (IsOrdered)
5180 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5181 NeedsBarrier, ChunkSize);
5182 // FIXME: Monotonicity ignored?
5183 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5184
5185 case OMPScheduleType::BaseStaticChunked:
5186 if (IsOrdered)
5187 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5188 NeedsBarrier, ChunkSize);
5189 // FIXME: Monotonicity ignored?
5190 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5191 ChunkSize);
5192
5193 case OMPScheduleType::BaseRuntime:
5194 case OMPScheduleType::BaseAuto:
5195 case OMPScheduleType::BaseGreedy:
5196 case OMPScheduleType::BaseBalanced:
5197 case OMPScheduleType::BaseSteal:
5198 case OMPScheduleType::BaseGuidedSimd:
5199 case OMPScheduleType::BaseRuntimeSimd:
5200 assert(!ChunkSize &&
5201 "schedule type does not support user-defined chunk sizes");
5202 [[fallthrough]];
5203 case OMPScheduleType::BaseDynamicChunked:
5204 case OMPScheduleType::BaseGuidedChunked:
5205 case OMPScheduleType::BaseGuidedIterativeChunked:
5206 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5207 case OMPScheduleType::BaseStaticBalancedChunked:
5208 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5209 NeedsBarrier, ChunkSize);
5210
5211 default:
5212 llvm_unreachable("Unknown/unimplemented schedule kind");
5213 }
5214}
5215
5216/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5217/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5218/// the runtime. Always interpret integers as unsigned similarly to
5219/// CanonicalLoopInfo.
5220static FunctionCallee
5221getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5222 unsigned Bitwidth = Ty->getIntegerBitWidth();
5223 if (Bitwidth == 32)
5224 return OMPBuilder.getOrCreateRuntimeFunction(
5225 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5226 if (Bitwidth == 64)
5227 return OMPBuilder.getOrCreateRuntimeFunction(
5228 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5229 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5230}
5231
5232/// Returns an LLVM function to call for updating the next loop using OpenMP
5233/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5234/// the runtime. Always interpret integers as unsigned similarly to
5235/// CanonicalLoopInfo.
5236static FunctionCallee
5237getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5238 unsigned Bitwidth = Ty->getIntegerBitWidth();
5239 if (Bitwidth == 32)
5240 return OMPBuilder.getOrCreateRuntimeFunction(
5241 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5242 if (Bitwidth == 64)
5243 return OMPBuilder.getOrCreateRuntimeFunction(
5244 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5245 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5246}
5247
5248/// Returns an LLVM function to call for finalizing the dynamic loop,
5249/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5250/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5251static FunctionCallee
5252getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5253 unsigned Bitwidth = Ty->getIntegerBitWidth();
5254 if (Bitwidth == 32)
5255 return OMPBuilder.getOrCreateRuntimeFunction(
5256 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5257 if (Bitwidth == 64)
5258 return OMPBuilder.getOrCreateRuntimeFunction(
5259 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5260 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5261}
5262
5263OpenMPIRBuilder::InsertPointOrErrorTy
5264OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5265 InsertPointTy AllocaIP,
5266 OMPScheduleType SchedType,
5267 bool NeedsBarrier, Value *Chunk) {
5268 assert(CLI->isValid() && "Requires a valid canonical loop");
5269 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5270 "Require dedicated allocate IP");
5272 "Require valid schedule type");
5273
5274 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5275 OMPScheduleType::ModifierOrdered;
5276
5277 // Set up the source location value for OpenMP runtime.
5278 Builder.SetCurrentDebugLocation(DL);
5279
5280 uint32_t SrcLocStrSize;
5281 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5282 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5283
5284 // Declare useful OpenMP runtime functions.
5285 Value *IV = CLI->getIndVar();
5286 Type *IVTy = IV->getType();
5287 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5288 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5289
5290 // Allocate space for computed loop bounds as expected by the "init" function.
5291 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5292 Type *I32Type = Type::getInt32Ty(M.getContext());
5293 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5294 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5295 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5296 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5297 CLI->setLastIter(PLastIter);
5298
5299 // At the end of the preheader, prepare for calling the "init" function by
5300 // storing the current loop bounds into the allocated space. A canonical loop
5301 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5302 // and produces an inclusive upper bound.
5303 BasicBlock *PreHeader = CLI->getPreheader();
5304 Builder.SetInsertPoint(PreHeader->getTerminator());
5305 Constant *One = ConstantInt::get(IVTy, 1);
5306 Builder.CreateStore(One, PLowerBound);
5307 Value *UpperBound = CLI->getTripCount();
5308 Builder.CreateStore(UpperBound, PUpperBound);
5309 Builder.CreateStore(One, PStride);
5310
5311 BasicBlock *Header = CLI->getHeader();
5312 BasicBlock *Exit = CLI->getExit();
5313 BasicBlock *Cond = CLI->getCond();
5314 BasicBlock *Latch = CLI->getLatch();
5315 InsertPointTy AfterIP = CLI->getAfterIP();
5316
5317 // The CLI will be "broken" in the code below, as the loop is no longer
5318 // a valid canonical loop.
5319
5320 if (!Chunk)
5321 Chunk = One;
5322
5323 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5324
5325 Constant *SchedulingType =
5326 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5327
5328 // Call the "init" function.
5329 Builder.CreateCall(DynamicInit,
5330 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5331 UpperBound, /* step */ One, Chunk});
5332
5333 // An outer loop around the existing one.
5334 BasicBlock *OuterCond = BasicBlock::Create(
5335 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5336 PreHeader->getParent());
5337 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5338 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5339 Value *Res =
5340 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5341 PLowerBound, PUpperBound, PStride});
5342 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5343 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5344 Value *LowerBound =
5345 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5346 Builder.CreateCondBr(MoreWork, Header, Exit);
5347
5348 // Change PHI-node in loop header to use outer cond rather than preheader,
5349 // and set IV to the LowerBound.
5350 Instruction *Phi = &Header->front();
5351 auto *PI = cast<PHINode>(Phi);
5352 PI->setIncomingBlock(0, OuterCond);
5353 PI->setIncomingValue(0, LowerBound);
5354
5355 // Then set the pre-header to jump to the OuterCond
5356 Instruction *Term = PreHeader->getTerminator();
5357 auto *Br = cast<BranchInst>(Term);
5358 Br->setSuccessor(0, OuterCond);
5359
5360 // Modify the inner condition:
5361 // * Use the UpperBound returned from the DynamicNext call.
5362 // * Jump to the outer loop when done with one of the inner loops.
5363 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5364 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5365 Instruction *Comp = &*Builder.GetInsertPoint();
5366 auto *CI = cast<CmpInst>(Comp);
5367 CI->setOperand(1, UpperBound);
5368 // Redirect the inner exit to branch to outer condition.
5369 Instruction *Branch = &Cond->back();
5370 auto *BI = cast<BranchInst>(Branch);
5371 assert(BI->getSuccessor(1) == Exit);
5372 BI->setSuccessor(1, OuterCond);
5373
5374 // Call the "fini" function if "ordered" is present in wsloop directive.
5375 if (Ordered) {
5376 Builder.SetInsertPoint(&Latch->back());
5377 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5378 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5379 }
5380
5381 // Add the barrier if requested.
5382 if (NeedsBarrier) {
5383 Builder.SetInsertPoint(&Exit->back());
5384 InsertPointOrErrorTy BarrierIP =
5385 createBarrier(LocationDescription(Builder.saveIP(), DL),
5386 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5387 /* CheckCancelFlag */ false);
5388 if (!BarrierIP)
5389 return BarrierIP.takeError();
5390 }
5391
5392 CLI->invalidate();
5393 return AfterIP;
5394}
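
// The loop rewritten above has, roughly, the structure:
//   call @__kmpc_dispatch_init_{4u,8u}(loc, tid, sched, /*lb=*/1,
//                                      /*ub=*/tripcount, /*step=*/1, chunk)
//   while (@__kmpc_dispatch_next(loc, tid, &last, &lb, &ub, &stride)) {
//     for (iv = lb - 1; iv ult ub; ++iv) // cond now tests the runtime ub
//       body(iv);
//   }
// The runtime hands out 1-based inclusive bounds, hence the "lb - 1" for the
// zero-based canonical IV.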
5395
5396/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5397/// after this \p OldTarget will be orphaned.
5398static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5399 BasicBlock *NewTarget, DebugLoc DL) {
5400 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5401 redirectTo(Pred, NewTarget, DL);
5402}
5403
5404/// Determine which blocks in \p BBs are reachable from outside and remove the
5405/// ones that are not reachable from the function.
5406static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5407 SmallPtrSet<BasicBlock *, 6> BBsToErase(BBs.begin(), BBs.end());
5408 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5409 for (Use &U : BB->uses()) {
5410 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5411 if (!UseInst)
5412 continue;
5413 if (BBsToErase.count(UseInst->getParent()))
5414 continue;
5415 return true;
5416 }
5417 return false;
5418 };
5419
5420 while (BBsToErase.remove_if(HasRemainingUses)) {
5421 // Try again if anything was removed.
5422 }
5423
5424 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5425 DeleteDeadBlocks(BBVec);
5426}
5427
5428CanonicalLoopInfo *
5429OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5430 InsertPointTy ComputeIP) {
5431 assert(Loops.size() >= 1 && "At least one loop required");
5432 size_t NumLoops = Loops.size();
5433
5434 // Nothing to do if there is already just one loop.
5435 if (NumLoops == 1)
5436 return Loops.front();
5437
5438 CanonicalLoopInfo *Outermost = Loops.front();
5439 CanonicalLoopInfo *Innermost = Loops.back();
5440 BasicBlock *OrigPreheader = Outermost->getPreheader();
5441 BasicBlock *OrigAfter = Outermost->getAfter();
5442 Function *F = OrigPreheader->getParent();
5443
5444 // Loop control blocks that may become orphaned later.
5445 SmallVector<BasicBlock *, 12> OldControlBBs;
5446 OldControlBBs.reserve(6 * Loops.size());
5447 for (CanonicalLoopInfo *Loop : Loops)
5448 Loop->collectControlBlocks(OldControlBBs);
5449
5450 // Setup the IRBuilder for inserting the trip count computation.
5451 Builder.SetCurrentDebugLocation(DL);
5452 if (ComputeIP.isSet())
5453 Builder.restoreIP(ComputeIP);
5454 else
5455 Builder.restoreIP(Outermost->getPreheaderIP());
5456
5457 // Derive the collapsed loop's trip count.
5458 // TODO: Find common/largest indvar type.
5459 Value *CollapsedTripCount = nullptr;
5460 for (CanonicalLoopInfo *L : Loops) {
5461 assert(L->isValid() &&
5462 "All loops to collapse must be valid canonical loops");
5463 Value *OrigTripCount = L->getTripCount();
5464 if (!CollapsedTripCount) {
5465 CollapsedTripCount = OrigTripCount;
5466 continue;
5467 }
5468
5469 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5470 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5471 }
5472
5473 // Create the collapsed loop control flow.
5474 CanonicalLoopInfo *Result =
5475 createLoopSkeleton(DL, CollapsedTripCount, F,
5476 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5477
5478 // Build the collapsed loop body code.
5479 // Start with deriving the input loop induction variables from the collapsed
5480 // one, using a divmod scheme. To preserve the original loops' order, the
5481 // innermost loop uses the least significant bits.
5482 Builder.restoreIP(Result->getBodyIP());
5483
5484 Value *Leftover = Result->getIndVar();
5485 SmallVector<Value *> NewIndVars;
5486 NewIndVars.resize(NumLoops);
5487 for (int i = NumLoops - 1; i >= 1; --i) {
5488 Value *OrigTripCount = Loops[i]->getTripCount();
5489
5490 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5491 NewIndVars[i] = NewIndVar;
5492
5493 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5494 }
5495 // Outermost loop gets all the remaining bits.
5496 NewIndVars[0] = Leftover;
5497
5498 // Construct the loop body control flow.
5499 // We progressively construct the branch structure following the direction
5500 // of the control flow, from the leading in-between code through the loop
5501 // nest body and the trailing in-between code, before rejoining the
5502 // collapsed loop's latch. ContinueBlock and ContinuePred keep track of the
5503 // source(s) of the next edge. If ContinueBlock is set, continue with that
5504 // block. If ContinuePred, use its predecessors as sources.
5505 BasicBlock *ContinueBlock = Result->getBody();
5506 BasicBlock *ContinuePred = nullptr;
5507 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5508 BasicBlock *NextSrc) {
5509 if (ContinueBlock)
5510 redirectTo(ContinueBlock, Dest, DL);
5511 else
5512 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5513
5514 ContinueBlock = nullptr;
5515 ContinuePred = NextSrc;
5516 };
5517
5518 // The code before the nested loop of each level.
5519 // Because we are sinking it into the nest, it will be executed more often
5520 // than in the original loop. More sophisticated schemes could keep track of what
5521 // the in-between code is and instantiate it only once per thread.
5522 for (size_t i = 0; i < NumLoops - 1; ++i)
5523 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5524
5525 // Connect the loop nest body.
5526 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5527
5528 // The code after the nested loop at each level.
5529 for (size_t i = NumLoops - 1; i > 0; --i)
5530 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5531
5532 // Connect the finished loop to the collapsed loop latch.
5533 ContinueWith(Result->getLatch(), nullptr);
5534
5535 // Replace the input loops with the new collapsed loop.
5536 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5537 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5538
5539 // Replace the input loop indvars with the derived ones.
5540 for (size_t i = 0; i < NumLoops; ++i)
5541 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5542
5543 // Remove unused parts of the input loops.
5544 removeUnusedBlocksFromParent(OldControlBBs);
5545
5546 for (CanonicalLoopInfo *L : Loops)
5547 L->invalidate();
5548
5549#ifndef NDEBUG
5550 Result->assertOK();
5551#endif
5552 return Result;
5553}
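
// Worked example: collapsing two loops with trip counts 3 (outer) and 4
// (inner) yields a single loop of 3 * 4 = 12 iterations whose indvar %cv is
// decomposed back via divmod, innermost first:
//   j = %cv urem 4;  i = %cv udiv 4;
// so %cv = 7 maps to (i, j) = (1, 3), preserving the original iteration
// order.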
5554
5555std::vector<CanonicalLoopInfo *>
5556OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5557 ArrayRef<Value *> TileSizes) {
5558 assert(TileSizes.size() == Loops.size() &&
5559 "Must pass as many tile sizes as there are loops");
5560 int NumLoops = Loops.size();
5561 assert(NumLoops >= 1 && "At least one loop to tile required");
5562
5563 CanonicalLoopInfo *OutermostLoop = Loops.front();
5564 CanonicalLoopInfo *InnermostLoop = Loops.back();
5565 Function *F = OutermostLoop->getBody()->getParent();
5566 BasicBlock *InnerEnter = InnermostLoop->getBody();
5567 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5568
5569 // Loop control blocks that may become orphaned later.
5570 SmallVector<BasicBlock *, 12> OldControlBBs;
5571 OldControlBBs.reserve(6 * Loops.size());
5572 for (CanonicalLoopInfo *Loop : Loops)
5573 Loop->collectControlBlocks(OldControlBBs);
5574
5575 // Collect original trip counts and induction variables to be accessible by
5576 // index. Also, the structure of the original loops is not preserved during
5577 // the construction of the tiled loops, so do it before we scavenge the BBs of
5578 // any original CanonicalLoopInfo.
5579 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5580 for (CanonicalLoopInfo *L : Loops) {
5581 assert(L->isValid() && "All input loops must be valid canonical loops");
5582 OrigTripCounts.push_back(L->getTripCount());
5583 OrigIndVars.push_back(L->getIndVar());
5584 }
5585
5586 // Collect the code between loop headers. These may contain SSA definitions
5587 // that are used in the loop nest body. To be usable within the innermost
5588 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5589 // these instructions may be executed more often than before the tiling.
5590 // TODO: It would be sufficient to only sink them into body of the
5591 // corresponding tile loop.
5592 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5593 for (int i = 0; i < NumLoops - 1; ++i) {
5594 CanonicalLoopInfo *Surrounding = Loops[i];
5595 CanonicalLoopInfo *Nested = Loops[i + 1];
5596
5597 BasicBlock *EnterBB = Surrounding->getBody();
5598 BasicBlock *ExitBB = Nested->getHeader();
5599 InbetweenCode.emplace_back(EnterBB, ExitBB);
5600 }
5601
5602 // Compute the trip counts of the floor loops.
5603 Builder.SetCurrentDebugLocation(DL);
5604 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5605 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5606 for (int i = 0; i < NumLoops; ++i) {
5607 Value *TileSize = TileSizes[i];
5608 Value *OrigTripCount = OrigTripCounts[i];
5609 Type *IVType = OrigTripCount->getType();
5610
5611 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5612 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5613
5614 // 0 if tripcount divides the tilesize, 1 otherwise.
5615 // 1 means we need an additional iteration for a partial tile.
5616 //
5617 // Unfortunately we cannot just use the roundup-formula
5618 // (tripcount + tilesize - 1)/tilesize
5619 // because the summation might overflow. We do not want to introduce undefined
5620 // behavior when the untiled loop nest did not.
5621 Value *FloorTripOverflow =
5622 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5623
5624 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5625 Value *FloorTripCount =
5626 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5627 "omp_floor" + Twine(i) + ".tripcount", true);
5628
5629 // Remember some values for later use.
5630 FloorCompleteCount.push_back(FloorCompleteTripCount);
5631 FloorCount.push_back(FloorTripCount);
5632 FloorRems.push_back(FloorTripRem);
5633 }
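// Worked example (illustrative only, values are hypothetical): for an
// original trip count of 10 and a tile size of 4, the code above computes
//   FloorCompleteTripCount = 10 udiv 4 = 2  ; number of complete tiles
//   FloorTripRem           = 10 urem 4 = 2  ; iterations in the partial tile
//   FloorTripOverflow      = (2 != 0)   = 1  ; an extra, partial tile is needed
//   FloorTripCount         = 2 + 1      = 3  ; the floor loop runs 3 times
// The naive round-up form (10 + 4 - 1) udiv 4 gives the same result here,
// but could wrap for trip counts near the maximum of the IV type.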
5634
5635 // Generate the new loop nest, from the outermost to the innermost.
5636 std::vector<CanonicalLoopInfo *> Result;
5637 Result.reserve(NumLoops * 2);
5638
5639 // The basic block of the surrounding loop that enters the next generated
5640 // loop.
5641 BasicBlock *Enter = OutermostLoop->getPreheader();
5642
5643 // The basic block of the surrounding loop where the inner code should
5644 // continue.
5645 BasicBlock *Continue = OutermostLoop->getAfter();
5646
5647 // Where the next loop basic block should be inserted.
5648 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5649
5650 auto EmbeddNewLoop =
5651 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5652 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5653 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5654 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5655 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5656 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5657
5658 // Setup the position where the next embedded loop connects to this loop.
5659 Enter = EmbeddedLoop->getBody();
5660 Continue = EmbeddedLoop->getLatch();
5661 OutroInsertBefore = EmbeddedLoop->getLatch();
5662 return EmbeddedLoop;
5663 };
5664
5665 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5666 const Twine &NameBase) {
5667 for (auto P : enumerate(TripCounts)) {
5668 CanonicalLoopInfo *EmbeddedLoop =
5669 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5670 Result.push_back(EmbeddedLoop);
5671 }
5672 };
5673
5674 EmbeddNewLoops(FloorCount, "floor");
5675
5676 // Within the innermost floor loop, emit the code that computes the tile
5677 // trip counts.
5678 Builder.SetInsertPoint(Enter->getTerminator());
5679 SmallVector<Value *, 4> TileCounts;
5680 for (int i = 0; i < NumLoops; ++i) {
5681 CanonicalLoopInfo *FloorLoop = Result[i];
5682 Value *TileSize = TileSizes[i];
5683
5684 Value *FloorIsEpilogue =
5685 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5686 Value *TileTripCount =
5687 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5688
5689 TileCounts.push_back(TileTripCount);
5690 }
5691
5692 // Create the tile loops.
5693 EmbeddNewLoops(TileCounts, "tile");
5694
5695 // Insert the inbetween code into the body.
5696 BasicBlock *BodyEnter = Enter;
5697 BasicBlock *BodyEntered = nullptr;
5698 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5699 BasicBlock *EnterBB = P.first;
5700 BasicBlock *ExitBB = P.second;
5701
5702 if (BodyEnter)
5703 redirectTo(BodyEnter, EnterBB, DL);
5704 else
5705 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5706
5707 BodyEnter = nullptr;
5708 BodyEntered = ExitBB;
5709 }
5710
5711 // Append the original loop nest body into the generated loop nest body.
5712 if (BodyEnter)
5713 redirectTo(BodyEnter, InnerEnter, DL);
5714 else
5715 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5717
5718 // Replace the original induction variable with an induction variable computed
5719 // from the tile and floor induction variables.
5720 Builder.restoreIP(Result.back()->getBodyIP());
5721 for (int i = 0; i < NumLoops; ++i) {
5722 CanonicalLoopInfo *FloorLoop = Result[i];
5723 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5724 Value *OrigIndVar = OrigIndVars[i];
5725 Value *Size = TileSizes[i];
5726
5727 Value *Scale =
5728 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5729 Value *Shift =
5730 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5731 OrigIndVar->replaceAllUsesWith(Shift);
5732 }
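// Worked example (illustrative only, values are hypothetical): with a tile
// size of 4 and an original trip count of 10, the replacement computes
// OrigIndVar = FloorIV * 4 + TileIV, i.e.
//   FloorIV = 0, TileIV = 0..3  ->  OrigIndVar = 0..3
//   FloorIV = 1, TileIV = 0..3  ->  OrigIndVar = 4..7
//   FloorIV = 2, TileIV = 0..1  ->  OrigIndVar = 8..9  (partial tile)
// The nuw flags reflect that these products and sums never exceed the
// original trip count.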
5733
5734 // Remove unused parts of the original loops.
5735 removeUnusedBlocksFromParent(OldControlBBs);
5736
5737 for (CanonicalLoopInfo *L : Loops)
5738 L->invalidate();
5739
5740#ifndef NDEBUG
5741 for (CanonicalLoopInfo *GenL : Result)
5742 GenL->assertOK();
5743#endif
5744 return Result;
5745}
5746
5747/// Attach metadata \p Properties to the basic block described by \p BB. If the
5748/// basic block already has metadata, the basic block properties are appended.
5749 static void addBasicBlockMetadata(BasicBlock *BB,
5750 ArrayRef<Metadata *> Properties) {
5751 // Nothing to do if no property to attach.
5752 if (Properties.empty())
5753 return;
5754
5755 LLVMContext &Ctx = BB->getContext();
5756 SmallVector<Metadata *> NewProperties;
5757 NewProperties.push_back(nullptr);
5758
5759 // If the basic block already has metadata, prepend it to the new metadata.
5760 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5761 if (Existing)
5762 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5763
5764 append_range(NewProperties, Properties);
5765 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5766 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5767
5768 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5769}
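// Sketch of the resulting IR (illustrative only; the metadata numbering is
// invented): loop metadata is a distinct node whose first operand is the node
// itself, so after attaching e.g. "llvm.loop.unroll.enable" the latch
// terminator looks like
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}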
5770
5771/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5772/// loop already has metadata, the loop properties are appended.
5773static void addLoopMetadata(CanonicalLoopInfo *Loop,
5774 ArrayRef<Metadata *> Properties) {
5775 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5776
5777 // Attach metadata to the loop's latch
5778 BasicBlock *Latch = Loop->getLatch();
5779 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5780 addBasicBlockMetadata(Latch, Properties);
5781}
5782
5783/// Attach llvm.access.group metadata to the memref instructions of \p Block
5784static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5785 LoopInfo &LI) {
5786 for (Instruction &I : *Block) {
5787 if (I.mayReadOrWriteMemory()) {
5788 // TODO: This instruction may already have access group from
5789 // other pragmas e.g. #pragma clang loop vectorize. Append
5790 // so that the existing metadata is not overwritten.
5791 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5792 }
5793 }
5794}
5795
5796void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5797 LLVMContext &Ctx = Builder.getContext();
5798 addLoopMetadata(
5799 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5800 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5801}
5802
5803void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5804 LLVMContext &Ctx = Builder.getContext();
5805 addLoopMetadata(
5806 Loop, {
5807 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5808 });
5809}
5810
5811void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5812 Value *IfCond, ValueToValueMapTy &VMap,
5813 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5814 const Twine &NamePrefix) {
5815 Function *F = CanonicalLoop->getFunction();
5816
5817 // We can't do
5818 // if (cond) {
5819 // simd_loop;
5820 // } else {
5821 // non_simd_loop;
5822 // }
5823 // because then the CanonicalLoopInfo would only point to one of the loops,
5824 // causing other constructs that operate on the same loop to malfunction.
5825 // Instead generate
5826 // while (...) {
5827 // if (cond) {
5828 // simd_body;
5829 // } else {
5830 // not_simd_body;
5831 // }
5832 // }
5833 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5834 // body at -O3
5835
5836 // Define where the if branch should be inserted.
5837 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5838
5839 // Create additional blocks for the if statement
5840 BasicBlock *Cond = SplitBeforeIt->getParent();
5841 llvm::LLVMContext &C = Cond->getContext();
5842 BasicBlock *ThenBlock = BasicBlock::Create(
5843 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5844 BasicBlock *ElseBlock = BasicBlock::Create(
5845 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5846
5847 // Create if condition branch.
5848 Builder.SetInsertPoint(SplitBeforeIt);
5849 Instruction *BrInstr =
5850 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5851 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5852 // Then block contains branch to omp loop body which needs to be vectorized
5853 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5854 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5855
5856 Builder.SetInsertPoint(ElseBlock);
5857
5858 // Clone loop for the else branch
5859 SmallVector<BasicBlock *, 8> NewBlocks;
5860
5861 SmallVector<BasicBlock *, 8> ExistingBlocks;
5862 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5863 ExistingBlocks.push_back(ThenBlock);
5864 ExistingBlocks.append(L->block_begin(), L->block_end());
5865 // Cond is the block that has the if clause condition
5866 // LoopCond is omp_loop.cond
5867 // LoopHeader is omp_loop.header
5868 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5869 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
5870 assert(LoopCond && LoopHeader && "Invalid loop structure");
5871 for (BasicBlock *Block : ExistingBlocks) {
5872 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5873 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5874 continue;
5875 }
5876 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5877
5878 // Fix the name so it is not omp.if.then.
5879 if (Block == ThenBlock)
5880 NewBB->setName(NamePrefix + ".if.else");
5881
5882 NewBB->moveBefore(CanonicalLoop->getExit());
5883 VMap[Block] = NewBB;
5884 NewBlocks.push_back(NewBB);
5885 }
5886 remapInstructionsInBlocks(NewBlocks, VMap);
5887 Builder.CreateBr(NewBlocks.front());
5888
5889 // The loop latch must have only one predecessor. Currently it is branched to
5890 // from both the 'then' and 'else' branches.
5891 L->getLoopLatch()->splitBasicBlock(
5892 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5893
5894 // Ensure that the then block is added to the loop so that the attributes
5895 // are added in the next step.
5896 L->addBasicBlockToLoop(ThenBlock, LI);
5897}
5898
5899unsigned
5900OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5901 const StringMap<bool> &Features) {
5902 if (TargetTriple.isX86()) {
5903 if (Features.lookup("avx512f"))
5904 return 512;
5905 else if (Features.lookup("avx"))
5906 return 256;
5907 return 128;
5908 }
5909 if (TargetTriple.isPPC())
5910 return 128;
5911 if (TargetTriple.isWasm())
5912 return 128;
5913 return 0;
5914}
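// Minimal usage sketch (the caller and variable names are hypothetical). The
// x86 values above correspond to vector register widths in bits, so a
// front-end would typically convert the result to a byte alignment:
//   StringMap<bool> Features;
//   Features["avx"] = true;
//   unsigned Bits = OMPBuilder.getOpenMPDefaultSimdAlign(TargetTriple, Features);
//   unsigned Bytes = Bits ? Bits / 8 : 0; // 256 bits -> 32-byte alignment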
5915
5916void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5917 MapVector<Value *, Value *> AlignedVars,
5918 Value *IfCond, OrderKind Order,
5919 ConstantInt *Simdlen, ConstantInt *Safelen) {
5920 LLVMContext &Ctx = Builder.getContext();
5921
5922 Function *F = CanonicalLoop->getFunction();
5923
5924 // TODO: We should not rely on pass manager. Currently we use pass manager
5925 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5926 // object. We should have a method which returns all blocks between
5927 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5928 FunctionAnalysisManager FAM;
5929 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5930 FAM.registerPass([]() { return LoopAnalysis(); });
5931 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5932
5933 LoopAnalysis LIA;
5934 LoopInfo &&LI = LIA.run(*F, FAM);
5935
5936 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5937 if (AlignedVars.size()) {
5938 InsertPointTy IP = Builder.saveIP();
5939 for (auto &AlignedItem : AlignedVars) {
5940 Value *AlignedPtr = AlignedItem.first;
5941 Value *Alignment = AlignedItem.second;
5942 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5943 Builder.SetInsertPoint(loadInst->getNextNode());
5944 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5945 Alignment);
5946 }
5947 Builder.restoreIP(IP);
5948 }
5949
5950 if (IfCond) {
5951 ValueToValueMapTy VMap;
5952 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5953 }
5954
5955 SmallPtrSet<BasicBlock *, 8> Reachable;
5956
5957 // Get the basic blocks from the loop in which memref instructions
5958 // can be found.
5959 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5960 // preferably without running any passes.
5961 for (BasicBlock *Block : L->getBlocks()) {
5962 if (Block == CanonicalLoop->getCond() ||
5963 Block == CanonicalLoop->getHeader())
5964 continue;
5965 Reachable.insert(Block);
5966 }
5967
5968 SmallVector<Metadata *> LoopMDList;
5969
5970 // In the presence of a finite 'safelen', it may be unsafe to mark all
5971 // the memory instructions parallel, because loop-carried
5972 // dependences of 'safelen' iterations are possible.
5973 // If the order(concurrent) clause is specified, then the memory instructions
5974 // are marked parallel even if 'safelen' is finite.
5975 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5976 // Add access group metadata to memory-access instructions.
5977 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5978 for (BasicBlock *BB : Reachable)
5979 addSimdMetadata(BB, AccessGroup, LI);
5980 // TODO: If the loop has existing parallel access metadata, have
5981 // to combine two lists.
5982 LoopMDList.push_back(MDNode::get(
5983 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5984 }
5985
5986 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5987 // versions so we can't add the loop attributes in that case.
5988 if (IfCond) {
5989 // We can still add llvm.loop.parallel_accesses.
5990 addLoopMetadata(CanonicalLoop, LoopMDList);
5991 return;
5992 }
5993
5994 // Use the above access group metadata to create loop level
5995 // metadata, which should be distinct for each loop.
5996 ConstantAsMetadata *BoolConst =
5997 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5998 LoopMDList.push_back(MDNode::get(
5999 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6000
6001 if (Simdlen || Safelen) {
6002 // If both simdlen and safelen clauses are specified, the value of the
6003 // simdlen parameter must be less than or equal to the value of the safelen
6004 // parameter. Therefore, use safelen only in the absence of simdlen.
6005 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6006 LoopMDList.push_back(
6007 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6008 ConstantAsMetadata::get(VectorizeWidth)}));
6009 }
6010
6011 addLoopMetadata(CanonicalLoop, LoopMDList);
6012}
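// Sketch of the metadata emitted for 'simd simdlen(8)' without an if clause
// (illustrative only; the metadata numbering is invented):
//   %v = load i32, ptr %p, !llvm.access.group !2
//   br ..., !llvm.loop !0
//   !0 = distinct !{!0, !1, !3, !4}
//   !1 = !{!"llvm.loop.parallel_accesses", !2}
//   !2 = distinct !{}
//   !3 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !4 = !{!"llvm.loop.vectorize.width", i32 8}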
6013
6014/// Create the TargetMachine object to query the backend for optimization
6015/// preferences.
6016///
6017/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6018/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6019 /// needed for the LLVM pass pipeline. We use some default options to avoid
6020/// having to pass too many settings from the frontend that probably do not
6021/// matter.
6022///
6023/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6024/// method. If we are going to use TargetMachine for more purposes, especially
6025/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6026 /// might become worth requiring front-ends to pass on their TargetMachine,
6027 /// or at least cache it between methods. Note that while front-ends such as
6028 /// Clang have just a single main TargetMachine per translation unit,
6029 /// "target-cpu" and "target-features" that determine the TargetMachine are
6030 /// per-function and can be overridden using __attribute__((target("OPTIONS"))).
6031static std::unique_ptr<TargetMachine>
6032 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6033 Module *M = F->getParent();
6034
6035 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6036 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6037 const llvm::Triple &Triple = M->getTargetTriple();
6038
6039 std::string Error;
6040 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6041 if (!TheTarget)
6042 return {};
6043
6044 TargetOptions Options;
6045 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6046 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6047 /*CodeModel=*/std::nullopt, OptLevel));
6048}
6049
6050 /// Heuristically determine the best-performing unroll factor for \p CLI. This
6051/// depends on the target processor. We are re-using the same heuristics as the
6052/// LoopUnrollPass.
6053static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6054 Function *F = CLI->getFunction();
6055
6056 // Assume the user requests the most aggressive unrolling, even if the rest of
6057 // the code is optimized using a lower setting.
6058 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6059 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6060
6061 FunctionAnalysisManager FAM;
6062 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6063 FAM.registerPass([]() { return AssumptionAnalysis(); });
6064 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6065 FAM.registerPass([]() { return LoopAnalysis(); });
6066 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6067 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6068 TargetIRAnalysis TIRA;
6069 if (TM)
6070 TIRA = TargetIRAnalysis(
6071 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6072 FAM.registerPass([&]() { return TIRA; });
6073
6074 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6075 ScalarEvolutionAnalysis SEA;
6076 ScalarEvolution &&SE = SEA.run(*F, FAM);
6077 DominatorTreeAnalysis DTA;
6078 DominatorTree &&DT = DTA.run(*F, FAM);
6079 LoopAnalysis LIA;
6080 LoopInfo &&LI = LIA.run(*F, FAM);
6081 AssumptionAnalysis ACT;
6082 AssumptionCache &&AC = ACT.run(*F, FAM);
6083 OptimizationRemarkEmitter ORE{F};
6084
6085 Loop *L = LI.getLoopFor(CLI->getHeader());
6086 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6087
6088 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6089 L, SE, TTI,
6090 /*BlockFrequencyInfo=*/nullptr,
6091 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6092 /*UserThreshold=*/std::nullopt,
6093 /*UserCount=*/std::nullopt,
6094 /*UserAllowPartial=*/true,
6095 /*UserAllowRuntime=*/true,
6096 /*UserUpperBound=*/std::nullopt,
6097 /*UserFullUnrollMaxCount=*/std::nullopt);
6098
6099 UP.Force = true;
6100
6101 // Account for additional optimizations taking place before the LoopUnrollPass
6102 // would unroll the loop.
6103 UP.Threshold *= UnrollThresholdFactor;
6104 UP.PartialThreshold *= UnrollThresholdFactor;
6105
6106 // Use normal unroll factors even if the rest of the code is optimized for
6107 // size.
6108 UP.OptSizeThreshold = UP.Threshold;
6109 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6110
6111 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6112 << " Threshold=" << UP.Threshold << "\n"
6113 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6114 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6115 << " PartialOptSizeThreshold="
6116 << UP.PartialOptSizeThreshold << "\n");
6117
6118 // Disable peeling.
6119 TargetTransformInfo::PeelingPreferences PP =
6120 gatherPeelingPreferences(L, SE, TTI,
6121 /*UserAllowPeeling=*/false,
6122 /*UserAllowProfileBasedPeeling=*/false,
6123 /*UnrollingSpecficValues=*/false);
6124
6125 SmallPtrSet<const Value *, 20> EphValues;
6126 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6127
6128 // Assume that reads and writes to stack variables can be eliminated by
6129 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6130 // size.
6131 for (BasicBlock *BB : L->blocks()) {
6132 for (Instruction &I : *BB) {
6133 Value *Ptr;
6134 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6135 Ptr = Load->getPointerOperand();
6136 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6137 Ptr = Store->getPointerOperand();
6138 } else
6139 continue;
6140
6141 Ptr = Ptr->stripPointerCasts();
6142
6143 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6144 if (Alloca->getParent() == &F->getEntryBlock())
6145 EphValues.insert(&I);
6146 }
6147 }
6148 }
6149
6150 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6151
6152 // Loop is not unrollable if the loop contains certain instructions.
6153 if (!UCE.canUnroll()) {
6154 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6155 return 1;
6156 }
6157
6158 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6159 << "\n");
6160
6161 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6162 // be able to use it.
6163 int TripCount = 0;
6164 int MaxTripCount = 0;
6165 bool MaxOrZero = false;
6166 unsigned TripMultiple = 0;
6167
6168 bool UseUpperBound = false;
6169 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6170 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6171 UseUpperBound);
6172 unsigned Factor = UP.Count;
6173 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6174
6175 // This function returns 1 to signal that the loop should not be unrolled.
6176 if (Factor == 0)
6177 return 1;
6178 return Factor;
6179}
6180
6181void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6182 int32_t Factor,
6183 CanonicalLoopInfo **UnrolledCLI) {
6184 assert(Factor >= 0 && "Unroll factor must not be negative");
6185
6186 Function *F = Loop->getFunction();
6187 LLVMContext &Ctx = F->getContext();
6188
6189 // If the unrolled loop is not used for another loop-associated directive, it
6190 // is sufficient to add metadata for the LoopUnrollPass.
6191 if (!UnrolledCLI) {
6192 SmallVector<Metadata *, 2> LoopMetadata;
6193 LoopMetadata.push_back(
6194 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6195
6196 if (Factor >= 1) {
6197 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6198 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6199 LoopMetadata.push_back(MDNode::get(
6200 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6201 }
6202
6203 addLoopMetadata(Loop, LoopMetadata);
6204 return;
6205 }
6206
6207 // Heuristically determine the unroll factor.
6208 if (Factor == 0)
6209 Factor = computeHeuristicUnrollFactor(Loop);
6210
6211 // No change required with unroll factor 1.
6212 if (Factor == 1) {
6213 *UnrolledCLI = Loop;
6214 return;
6215 }
6216
6217 assert(Factor >= 2 &&
6218 "unrolling only makes sense with a factor of 2 or larger");
6219
6220 Type *IndVarTy = Loop->getIndVarType();
6221
6222 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6223 // unroll the inner loop.
6224 Value *FactorVal =
6225 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6226 /*isSigned=*/false));
6227 std::vector<CanonicalLoopInfo *> LoopNest =
6228 tileLoops(DL, {Loop}, {FactorVal});
6229 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6230 *UnrolledCLI = LoopNest[0];
6231 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6232
6233 // LoopUnrollPass can only fully unroll loops with constant trip count.
6234 // Unroll by the unroll factor with a fallback epilog for the remainder
6235 // iterations if necessary.
6236 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6237 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6238 addLoopMetadata(
6239 InnerLoop,
6240 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6241 MDNode::get(
6242 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6243
6244#ifndef NDEBUG
6245 (*UnrolledCLI)->assertOK();
6246#endif
6247}
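// Illustrative shape of the result (pseudo code; names are hypothetical):
// partially unrolling
//   for (i = 0; i < N; ++i) body(i);
// by a factor of 4 first tiles the loop into
//   for (f = 0; f < floorcount; ++f)
//     for (t = 0; t < min(4, N - f*4); ++t) body(f*4 + t);
// and then tags the inner tile loop with llvm.loop.unroll.enable and
// llvm.loop.unroll.count(4) so the LoopUnrollPass unrolls it by the factor,
// with the select-based tile trip count handling the partial epilog tile.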
6248
6249OpenMPIRBuilder::InsertPointTy
6250OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6251 llvm::Value *BufSize, llvm::Value *CpyBuf,
6252 llvm::Value *CpyFn, llvm::Value *DidIt) {
6253 if (!updateToLocation(Loc))
6254 return Loc.IP;
6255
6256 uint32_t SrcLocStrSize;
6257 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6258 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6259 Value *ThreadId = getOrCreateThreadID(Ident);
6260
6261 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6262
6263 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6264
6265 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6266 Builder.CreateCall(Fn, Args);
6267
6268 return Builder.saveIP();
6269}
6270
6271OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6272 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6273 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6274 ArrayRef<llvm::Function *> CPFuncs) {
6275
6276 if (!updateToLocation(Loc))
6277 return Loc.IP;
6278
6279 // If needed, allocate and initialize `DidIt` to 0.
6280 // DidIt: flag variable: 1=single thread; 0=not single thread.
6281 llvm::Value *DidIt = nullptr;
6282 if (!CPVars.empty()) {
6283 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6284 Builder.CreateStore(Builder.getInt32(0), DidIt);
6285 }
6286
6287 Directive OMPD = Directive::OMPD_single;
6288 uint32_t SrcLocStrSize;
6289 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6290 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6291 Value *ThreadId = getOrCreateThreadID(Ident);
6292 Value *Args[] = {Ident, ThreadId};
6293
6294 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6295 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6296
6297 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6298 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6299
6300 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6301 if (Error Err = FiniCB(IP))
6302 return Err;
6303
6304 // The thread that executes the single region must set `DidIt` to 1.
6305 // This is used by __kmpc_copyprivate, to know if the caller is the
6306 // single thread or not.
6307 if (DidIt)
6308 Builder.CreateStore(Builder.getInt32(1), DidIt);
6309
6310 return Error::success();
6311 };
6312
6313 // generates the following:
6314 // if (__kmpc_single()) {
6315 // .... single region ...
6316 // __kmpc_end_single
6317 // }
6318 // __kmpc_copyprivate
6319 // __kmpc_barrier
6320
6321 InsertPointOrErrorTy AfterIP =
6322 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6323 /*Conditional*/ true,
6324 /*hasFinalize*/ true);
6325 if (!AfterIP)
6326 return AfterIP.takeError();
6327
6328 if (DidIt) {
6329 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6330 // NOTE BufSize is currently unused, so just pass 0.
6331 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6332 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6333 CPFuncs[I], DidIt);
6334 // NOTE __kmpc_copyprivate already inserts a barrier
6335 } else if (!IsNowait) {
6336 InsertPointOrErrorTy AfterIP =
6337 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6338 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6339 /* CheckCancelFlag */ false);
6340 if (!AfterIP)
6341 return AfterIP.takeError();
6342 }
6343 return Builder.saveIP();
6344}
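// Minimal usage sketch (the surrounding OMPBuilder and Loc variables are
// hypothetical):
//   auto BodyGenCB = [&](InsertPointTy AllocaIP,
//                        InsertPointTy CodeGenIP) -> Error {
//     // ... emit the single-region body at CodeGenIP ...
//     return Error::success();
//   };
//   auto FiniCB = [&](InsertPointTy IP) -> Error { return Error::success(); };
//   InsertPointOrErrorTy AfterIP = OMPBuilder.createSingle(
//       Loc, BodyGenCB, FiniCB, /*IsNowait=*/false, /*CPVars=*/{},
//       /*CPFuncs=*/{});
// With empty CPVars no DidIt flag is allocated, and a trailing barrier is
// emitted unless IsNowait is set.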
6345
6346OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6347 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6348 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6349
6350 if (!updateToLocation(Loc))
6351 return Loc.IP;
6352
6353 Directive OMPD = Directive::OMPD_critical;
6354 uint32_t SrcLocStrSize;
6355 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6356 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6357 Value *ThreadId = getOrCreateThreadID(Ident);
6358 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6359 Value *Args[] = {Ident, ThreadId, LockVar};
6360
6361 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6362 Function *RTFn = nullptr;
6363 if (HintInst) {
6364 // Add Hint to entry Args and create call
6365 EnterArgs.push_back(HintInst);
6366 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6367 } else {
6368 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6369 }
6370 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6371
6372 Function *ExitRTLFn =
6373 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6374 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6375
6376 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6377 /*Conditional*/ false, /*hasFinalize*/ true);
6378}
6379
6380OpenMPIRBuilder::InsertPointTy
6381OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6382 InsertPointTy AllocaIP, unsigned NumLoops,
6383 ArrayRef<llvm::Value *> StoreValues,
6384 const Twine &Name, bool IsDependSource) {
6385 assert(
6386 llvm::all_of(StoreValues,
6387 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6388 "OpenMP runtime requires depend vec with i64 type");
6389
6390 if (!updateToLocation(Loc))
6391 return Loc.IP;
6392
6393 // Allocate space for vector and generate alloc instruction.
6394 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6395 Builder.restoreIP(AllocaIP);
6396 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6397 ArgsBase->setAlignment(Align(8));
6398 updateToLocation(Loc);
6399
6400 // Store the index value with offset in depend vector.
6401 for (unsigned I = 0; I < NumLoops; ++I) {
6402 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6403 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6404 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6405 STInst->setAlignment(Align(8));
6406 }
6407
6408 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6409 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6410
6411 uint32_t SrcLocStrSize;
6412 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6413 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6414 Value *ThreadId = getOrCreateThreadID(Ident);
6415 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6416
6417 Function *RTLFn = nullptr;
6418 if (IsDependSource)
6419 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6420 else
6421 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6422 Builder.CreateCall(RTLFn, Args);
6423
6424 return Builder.saveIP();
6425}
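// Sketch of the generated code for NumLoops == 2 (the IR is abbreviated and
// the value names are invented):
//   %vec = alloca [2 x i64], align 8        ; at AllocaIP
//   store i64 %iv0, ptr %vec.elem0, align 8
//   store i64 %iv1, ptr %vec.elem1, align 8
//   call void @__kmpc_doacross_post(ptr @loc, i32 %tid, ptr %vec.elem0)
// For a depend(sink : ...) clause, IsDependSource is false and
// @__kmpc_doacross_wait is called instead.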
6426
6427OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6428 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6429 FinalizeCallbackTy FiniCB, bool IsThreads) {
6430 if (!updateToLocation(Loc))
6431 return Loc.IP;
6432
6433 Directive OMPD = Directive::OMPD_ordered;
6434 Instruction *EntryCall = nullptr;
6435 Instruction *ExitCall = nullptr;
6436
6437 if (IsThreads) {
6438 uint32_t SrcLocStrSize;
6439 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6440 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6441 Value *ThreadId = getOrCreateThreadID(Ident);
6442 Value *Args[] = {Ident, ThreadId};
6443
6444 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6445 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6446
6447 Function *ExitRTLFn =
6448 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6449 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6450 }
6451
6452 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6453 /*Conditional*/ false, /*hasFinalize*/ true);
6454}
6455
6456OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6457 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6458 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6459 bool HasFinalize, bool IsCancellable) {
6460
6461 if (HasFinalize)
6462 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6463
6464 // Create inlined region's entry and body blocks, in preparation
6465 // for conditional creation
6466 BasicBlock *EntryBB = Builder.GetInsertBlock();
6467 Instruction *SplitPos = EntryBB->getTerminator();
6468 if (!isa_and_nonnull<BranchInst>(SplitPos))
6469 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6470 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6471 BasicBlock *FiniBB =
6472 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6473
6474 Builder.SetInsertPoint(EntryBB->getTerminator());
6475 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6476
6477 // generate body
6478 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6479 /* CodeGenIP */ Builder.saveIP()))
6480 return Err;
6481
6482 // emit exit call and do any needed finalization.
6483 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6484 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6485 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6486 "Unexpected control flow graph state!!");
6487 InsertPointOrErrorTy AfterIP =
6488 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6489 if (!AfterIP)
6490 return AfterIP.takeError();
6491 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6492 "Unexpected Control Flow State!");
6493 MergeBlockIntoPredecessor(FiniBB);
6494
6495 // If we are skipping the region of a non-conditional, remove the exit
6496 // block, and clear the builder's insertion point.
6497 assert(SplitPos->getParent() == ExitBB &&
6498 "Unexpected Insertion point location!");
6499 auto merged = MergeBlockIntoPredecessor(ExitBB);
6500 BasicBlock *ExitPredBB = SplitPos->getParent();
6501 auto InsertBB = merged ? ExitPredBB : ExitBB;
6502 if (!isa_and_nonnull<BranchInst>(SplitPos))
6503 SplitPos->eraseFromParent();
6504 Builder.SetInsertPoint(InsertBB);
6505
6506 return Builder.saveIP();
6507}
6508
6509OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6510 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6511 // If there is nothing to do, return the current insertion point.
6512 if (!Conditional || !EntryCall)
6513 return Builder.saveIP();
6514
6515 BasicBlock *EntryBB = Builder.GetInsertBlock();
6516 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6517 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6518 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6519
6520 // Emit thenBB and set the Builder's insertion point there for
6521 // body generation next. Place the block after the current block.
6522 Function *CurFn = EntryBB->getParent();
6523 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6524
6525 // Move Entry branch to end of ThenBB, and replace with conditional
6526 // branch (If-stmt)
6527 Instruction *EntryBBTI = EntryBB->getTerminator();
6528 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6529 EntryBBTI->removeFromParent();
6530 Builder.SetInsertPoint(UI);
6531 Builder.Insert(EntryBBTI);
6532 UI->eraseFromParent();
6533 Builder.SetInsertPoint(ThenBB->getTerminator());
6534
6535 // return an insertion point to ExitBB.
6536 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6537}
6538
6539OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6540 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6541 bool HasFinalize) {
6542
6543 Builder.restoreIP(FinIP);
6544
6545 // If there is finalization to do, emit it before the exit call
6546 if (HasFinalize) {
6547 assert(!FinalizationStack.empty() &&
6548 "Unexpected finalization stack state!");
6549
6550 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6551 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6552
6553 if (Error Err = Fi.FiniCB(FinIP))
6554 return Err;
6555
6556 BasicBlock *FiniBB = FinIP.getBlock();
6557 Instruction *FiniBBTI = FiniBB->getTerminator();
6558
6559 // Set the Builder's insertion point for call creation.
6560 Builder.SetInsertPoint(FiniBBTI);
6561 }
6562
6563 if (!ExitCall)
6564 return Builder.saveIP();
6565
6566 // Place the exit call as the last instruction before the finalization block terminator.
6567 ExitCall->removeFromParent();
6568 Builder.Insert(ExitCall);
6569
6570 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6571 ExitCall->getIterator());
6572}
6573
6574OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6575 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6576 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6577 if (!IP.isSet())
6578 return IP;
6579
6580 IRBuilder<>::InsertPointGuard IPG(Builder);
6581
6582 // creates the following CFG structure
6583 // OMP_Entry : (MasterAddr != PrivateAddr)?
6584 // F T
6585 // | \
6586 // | copyin.not.master
6587 // | /
6588 // v /
6589 // copyin.not.master.end
6590 // |
6591 // v
6592 // OMP.Entry.Next
6593
6594 BasicBlock *OMP_Entry = IP.getBlock();
6595 Function *CurFn = OMP_Entry->getParent();
6596 BasicBlock *CopyBegin =
6597 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6598 BasicBlock *CopyEnd = nullptr;
6599
6600 // If the entry block is terminated, split it to preserve the branch to the
6601 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6602 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6603 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6604 "copyin.not.master.end");
6605 OMP_Entry->getTerminator()->eraseFromParent();
6606 } else {
6607 CopyEnd =
6608 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6609 }
6610
6611 Builder.SetInsertPoint(OMP_Entry);
6612 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6613 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6614 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6615 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6616
6617 Builder.SetInsertPoint(CopyBegin);
6618 if (BranchtoEnd)
6619 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6620
6621 return Builder.saveIP();
6622}
6623
6624CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6625 Value *Size, Value *Allocator,
6626 std::string Name) {
6627 IRBuilder<>::InsertPointGuard IPG(Builder);
6628 updateToLocation(Loc);
6629
6630 uint32_t SrcLocStrSize;
6631 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6632 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6633 Value *ThreadId = getOrCreateThreadID(Ident);
6634 Value *Args[] = {ThreadId, Size, Allocator};
6635
6636 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6637
6638 return Builder.CreateCall(Fn, Args, Name);
6639}
6640
6641CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6642 Value *Addr, Value *Allocator,
6643 std::string Name) {
6644 IRBuilder<>::InsertPointGuard IPG(Builder);
6645 updateToLocation(Loc);
6646
6647 uint32_t SrcLocStrSize;
6648 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6649 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6650 Value *ThreadId = getOrCreateThreadID(Ident);
6651 Value *Args[] = {ThreadId, Addr, Allocator};
6652 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6653 return Builder.CreateCall(Fn, Args, Name);
6654}
6655
6656CallInst *OpenMPIRBuilder::createOMPInteropInit(
6657 const LocationDescription &Loc, Value *InteropVar,
6658 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6659 Value *DependenceAddress, bool HaveNowaitClause) {
6660 IRBuilder<>::InsertPointGuard IPG(Builder);
6661 updateToLocation(Loc);
6662
6663 uint32_t SrcLocStrSize;
6664 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6665 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6666 Value *ThreadId = getOrCreateThreadID(Ident);
6667 if (Device == nullptr)
6668 Device = ConstantInt::get(Int32, -1);
6669 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6670 if (NumDependences == nullptr) {
6671 NumDependences = ConstantInt::get(Int32, 0);
6672 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6673 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6674 }
6675 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6676 Value *Args[] = {
6677 Ident, ThreadId, InteropVar, InteropTypeVal,
6678 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6679
6680 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6681
6682 return Builder.CreateCall(Fn, Args);
6683}
6684
6685CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6686 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6687 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6688 IRBuilder<>::InsertPointGuard IPG(Builder);
6689 updateToLocation(Loc);
6690
6691 uint32_t SrcLocStrSize;
6692 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6693 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6694 Value *ThreadId = getOrCreateThreadID(Ident);
6695 if (Device == nullptr)
6696 Device = ConstantInt::get(Int32, -1);
6697 if (NumDependences == nullptr) {
6698 NumDependences = ConstantInt::get(Int32, 0);
6699 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6700 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6701 }
6702 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6703 Value *Args[] = {
6704 Ident, ThreadId, InteropVar, Device,
6705 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6706
6707 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6708
6709 return Builder.CreateCall(Fn, Args);
6710}
6711
6712CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6713 Value *InteropVar, Value *Device,
6714 Value *NumDependences,
6715 Value *DependenceAddress,
6716 bool HaveNowaitClause) {
6717 IRBuilder<>::InsertPointGuard IPG(Builder);
6718 updateToLocation(Loc);
6719 uint32_t SrcLocStrSize;
6720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6722 Value *ThreadId = getOrCreateThreadID(Ident);
6723 if (Device == nullptr)
6724 Device = ConstantInt::get(Int32, -1);
6725 if (NumDependences == nullptr) {
6726 NumDependences = ConstantInt::get(Int32, 0);
6727 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6728 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6729 }
6730 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6731 Value *Args[] = {
6732 Ident, ThreadId, InteropVar, Device,
6733 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6734
6735 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6736
6737 return Builder.CreateCall(Fn, Args);
6738}
6739
6740CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6741 const LocationDescription &Loc, llvm::Value *Pointer,
6742 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6743 IRBuilder<>::InsertPointGuard IPG(Builder);
6744 updateToLocation(Loc);
6745
6746 uint32_t SrcLocStrSize;
6747 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6748 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6749 Value *ThreadId = getOrCreateThreadID(Ident);
6750 Constant *ThreadPrivateCache =
6751 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6752 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6753
6754 Function *Fn =
6755 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6756
6757 return Builder.CreateCall(Fn, Args);
6758}
6759
6760OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6761 const LocationDescription &Loc,
6762 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6763 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6764 "expected num_threads and num_teams to be specified");
6765
6766 if (!updateToLocation(Loc))
6767 return Loc.IP;
6768
6769 uint32_t SrcLocStrSize;
6770 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6771 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6772 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6773 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6774 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6775 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6776 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6777
6778 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6779 Function *Kernel = DebugKernelWrapper;
6780
6781 // We need to strip the debug prefix to get the correct kernel name.
6782 StringRef KernelName = Kernel->getName();
6783 const std::string DebugPrefix = "_debug__";
6784 if (KernelName.ends_with(DebugPrefix)) {
6785 KernelName = KernelName.drop_back(DebugPrefix.length());
6786 Kernel = M.getFunction(KernelName);
6787 assert(Kernel && "Expected the real kernel to exist");
6788 }
6789
6790 // Manifest the launch configuration in the metadata matching the kernel
6791 // environment.
6792 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6793 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6794
6795 // If MaxThreads is not set, select the maximum of the default workgroup
6796 // size and the MinThreads value.
6797 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6798 if (MaxThreadsVal < 0)
6799 MaxThreadsVal = std::max(
6800 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6801
6802 if (MaxThreadsVal > 0)
6803 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6804
6805 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6806 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6807 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6808 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6809 Constant *ReductionDataSize =
6810 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6811 Constant *ReductionBufferLength =
6812 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6813
6814 Function *Fn = getOrCreateRuntimeFunctionPtr(
6815 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6816 const DataLayout &DL = Fn->getDataLayout();
6817
6818 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6819 Constant *DynamicEnvironmentInitializer =
6820 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6821 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6822 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6823 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6824 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6825 DL.getDefaultGlobalsAddressSpace());
6826 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6827
6828 Constant *DynamicEnvironment =
6829 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6830 ? DynamicEnvironmentGV
6831 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6832 DynamicEnvironmentPtr);
6833
6834 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6835 ConfigurationEnvironment, {
6836 UseGenericStateMachineVal,
6837 MayUseNestedParallelismVal,
6838 IsSPMDVal,
6839 MinThreads,
6840 MaxThreads,
6841 MinTeams,
6842 MaxTeams,
6843 ReductionDataSize,
6844 ReductionBufferLength,
6845 });
6846 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6847 KernelEnvironment, {
6848 ConfigurationEnvironmentInitializer,
6849 Ident,
6850 DynamicEnvironment,
6851 });
6852 std::string KernelEnvironmentName =
6853 (KernelName + "_kernel_environment").str();
6854 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6855 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6856 KernelEnvironmentInitializer, KernelEnvironmentName,
6857 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6858 DL.getDefaultGlobalsAddressSpace());
6859 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6860
6861 Constant *KernelEnvironment =
6862 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6863 ? KernelEnvironmentGV
6864 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6865 KernelEnvironmentPtr);
6866 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6867 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6868 KernelLaunchEnvironment =
6869 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6870 ? KernelLaunchEnvironment
6871 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6872 KernelLaunchEnvParamTy);
6873 CallInst *ThreadKind =
6874 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6875
6876 Value *ExecUserCode = Builder.CreateICmpEQ(
6877 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6878 "exec_user_code");
6879
6880 // ThreadKind = __kmpc_target_init(...)
6881 // if (ThreadKind == -1)
6882 // user_code
6883 // else
6884 // return;
6885
6886 auto *UI = Builder.CreateUnreachable();
6887 BasicBlock *CheckBB = UI->getParent();
6888 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6889
6890 BasicBlock *WorkerExitBB = BasicBlock::Create(
6891 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6892 Builder.SetInsertPoint(WorkerExitBB);
6893 Builder.CreateRetVoid();
6894
6895 auto *CheckBBTI = CheckBB->getTerminator();
6896 Builder.SetInsertPoint(CheckBBTI);
6897 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6898
6899 CheckBBTI->eraseFromParent();
6900 UI->eraseFromParent();
6901
6902 // Continue in the "user_code" block, see diagram above and in
6903 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6904 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6905}
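// Illustrative shape of the generated kernel prologue (value names are
// invented):
//   %tk = call i32 @__kmpc_target_init(ptr @foo_kernel_environment,
//                                      ptr %launch_env)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
// worker.exit:
//   ret void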
6906
6907void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6908 int32_t TeamsReductionDataSize,
6909 int32_t TeamsReductionBufferLength) {
6910 if (!updateToLocation(Loc))
6911 return;
6912
6913 Function *Fn = getOrCreateRuntimeFunctionPtr(
6914 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6915
6916 Builder.CreateCall(Fn, {});
6917
6918 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6919 return;
6920
6921 Function *Kernel = Builder.GetInsertBlock()->getParent();
6922 // We need to strip the debug prefix to get the correct kernel name.
6923 StringRef KernelName = Kernel->getName();
6924 const std::string DebugPrefix = "_debug__";
6925 if (KernelName.ends_with(DebugPrefix))
6926 KernelName = KernelName.drop_back(DebugPrefix.length());
6927 auto *KernelEnvironmentGV =
6928 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6929 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6930 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6931 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6932 KernelEnvironmentInitializer,
6933 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6934 NewInitializer = ConstantFoldInsertValueInstruction(
6935 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6936 {0, 8});
6937 KernelEnvironmentGV->setInitializer(NewInitializer);
6938}
6939
6940static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6941 bool Min) {
6942 if (Kernel.hasFnAttribute(Name)) {
6943 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6944 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6945 }
6946 Kernel.addFnAttr(Name, llvm::utostr(Value));
6947}
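// Example of the merge behavior (illustrative values): if the kernel already
// carries "nvvm.maxntid"="128" and this is called with Value=256, then
// Min=true keeps the tighter bound 128, while Min=false would widen it to
// 256. Without a pre-existing attribute, Value is used unchanged.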
6948
6949std::pair<int32_t, int32_t>
6950OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6951 int32_t ThreadLimit =
6952 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6953
6954 if (T.isAMDGPU()) {
6955 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6956 if (!Attr.isValid() || !Attr.isStringAttribute())
6957 return {0, ThreadLimit};
6958 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6959 int32_t LB, UB;
6960 if (!llvm::to_integer(UBStr, UB, 10))
6961 return {0, ThreadLimit};
6962 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6963 if (!llvm::to_integer(LBStr, LB, 10))
6964 return {0, UB};
6965 return {LB, UB};
6966 }
6967
6968 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6969 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6970 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6971 }
6972 return {0, ThreadLimit};
6973}
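// Worked example (illustrative values): given
// "amdgpu-flat-work-group-size"="1,256" and omp_target_thread_limit=128, the
// split yields LBStr="1" and UBStr="256"; UB parses to 256 and is clamped by
// the thread limit to 128, LB parses to 1, so the function returns {1, 128}.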
6974
6975void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6976 Function &Kernel, int32_t LB,
6977 int32_t UB) {
6978 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6979
6980 if (T.isAMDGPU()) {
6981 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6982 llvm::utostr(LB) + "," + llvm::utostr(UB));
6983 return;
6984 }
6985
6986 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6987}
6988
6989std::pair<int32_t, int32_t>
6990OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6991 // TODO: Read from backend annotations if available.
6992 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6993}
6994
6995void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6996 int32_t LB, int32_t UB) {
6997 if (T.isNVPTX())
6998 if (UB > 0)
6999 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7000 if (T.isAMDGPU())
7001 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7002
7003 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7004}
7005
7006void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7007 Function *OutlinedFn) {
7008 if (Config.isTargetDevice()) {
7009 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7010 // TODO: Determine if DSO local can be set to true.
7011 OutlinedFn->setDSOLocal(false);
7012 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7013 if (T.isAMDGCN())
7014 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7015 else if (T.isNVPTX())
7016 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7017 else if (T.isSPIRV())
7018 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7019 }
7020}
7021
7022Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7023 StringRef EntryFnIDName) {
7024 if (Config.isTargetDevice()) {
7025 assert(OutlinedFn && "The outlined function must exist if embedded");
7026 return OutlinedFn;
7027 }
7028
7029 return new GlobalVariable(
7030 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7031 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7032}
7033
7034Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7035 StringRef EntryFnName) {
7036 if (OutlinedFn)
7037 return OutlinedFn;
7038
7039 assert(!M.getGlobalVariable(EntryFnName, true) &&
7040 "Named kernel already exists?");
7041 return new GlobalVariable(
7042 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7043 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7044}
7045
7046Error OpenMPIRBuilder::emitTargetRegionFunction(
7047 TargetRegionEntryInfo &EntryInfo,
7048 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7049 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7050
7051 SmallString<64> EntryFnName;
7052 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7053
7054 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7055 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7056 if (!CBResult)
7057 return CBResult.takeError();
7058 OutlinedFn = *CBResult;
7059 } else {
7060 OutlinedFn = nullptr;
7061 }
7062
7063 // If this target outline function is not an offload entry, we don't need to
7064 // register it. This may be the case for a false if clause, or when there are
7065 // no OpenMP targets.
7066 if (!IsOffloadEntry)
7067 return Error::success();
7068
7069 std::string EntryFnIDName =
7070 Config.isTargetDevice()
7071 ? std::string(EntryFnName)
7072 : createPlatformSpecificName({EntryFnName, "region_id"});
7073
7074 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7075 EntryFnName, EntryFnIDName);
7076 return Error::success();
7077}
7078
7079Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7080 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7081 StringRef EntryFnName, StringRef EntryFnIDName) {
7082 if (OutlinedFn)
7083 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7084 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7085 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7086 OffloadInfoManager.registerTargetRegionEntryInfo(
7087 EntryInfo, EntryAddr, OutlinedFnID,
7088 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7089 return OutlinedFnID;
7090}
7091
7092OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7093 const LocationDescription &Loc, InsertPointTy AllocaIP,
7094 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7095 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7096 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7097 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7098 BodyGenTy BodyGenType)>
7099 BodyGenCB,
7100 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7101 if (!updateToLocation(Loc))
7102 return InsertPointTy();
7103
7104 Builder.restoreIP(CodeGenIP);
7105 // Disable TargetData CodeGen on Device pass.
7106 if (Config.IsTargetDevice.value_or(false)) {
7107 if (BodyGenCB) {
7108 InsertPointOrErrorTy AfterIP =
7109 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7110 if (!AfterIP)
7111 return AfterIP.takeError();
7112 Builder.restoreIP(*AfterIP);
7113 }
7114 return Builder.saveIP();
7115 }
7116
7117 bool IsStandAlone = !BodyGenCB;
7118 MapInfosTy *MapInfo;
7119 // Generate the code for the opening of the data environment. Capture all the
7120 // arguments of the runtime call by reference because they are used in the
7121 // closing of the region.
7122 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7123 InsertPointTy CodeGenIP) -> Error {
7124 MapInfo = &GenMapInfoCB(Builder.saveIP());
7125 if (Error Err = emitOffloadingArrays(
7126 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7127 /*IsNonContiguous=*/true, DeviceAddrCB))
7128 return Err;
7129
7130 TargetDataRTArgs RTArgs;
7131 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7132
7133 // Emit the number of elements in the offloading arrays.
7134 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7135
7136 // Source location for the ident struct
7137 if (!SrcLocInfo) {
7138 uint32_t SrcLocStrSize;
7139 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7140 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7141 }
7142
7143 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7144 SrcLocInfo, DeviceID,
7145 PointerNum, RTArgs.BasePointersArray,
7146 RTArgs.PointersArray, RTArgs.SizesArray,
7147 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7148 RTArgs.MappersArray};
7149
7150 if (IsStandAlone) {
7151 assert(MapperFunc && "MapperFunc missing for standalone target data");
7152
7153 auto TaskBodyCB = [&](Value *, Value *,
7154 IRBuilderBase::InsertPoint) -> Error {
7155 if (Info.HasNoWait) {
7156 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7157 llvm::Constant::getNullValue(VoidPtr),
7158 llvm::Constant::getNullValue(Int32),
7159 llvm::Constant::getNullValue(VoidPtr)});
7160 }
7161
7162 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7163 OffloadingArgs);
7164
7165 if (Info.HasNoWait) {
7166 BasicBlock *OffloadContBlock =
7167 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7168 Function *CurFn = Builder.GetInsertBlock()->getParent();
7169 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7170 Builder.restoreIP(Builder.saveIP());
7171 }
7172 return Error::success();
7173 };
7174
7175 bool RequiresOuterTargetTask = Info.HasNoWait;
7176 if (!RequiresOuterTargetTask)
7177 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7178 /*TargetTaskAllocaIP=*/{}));
7179 else
7180 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7181 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7182 } else {
7183 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7184 omp::OMPRTL___tgt_target_data_begin_mapper);
7185
7186 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7187
7188 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7189 if (isa<AllocaInst>(DeviceMap.second.second)) {
7190 auto *LI =
7191 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7192 Builder.CreateStore(LI, DeviceMap.second.second);
7193 }
7194 }
7195
7196 // If device pointer privatization is required, emit the body of the
7197 // region here. It will have to be duplicated: with and without
7198 // privatization.
7199 InsertPointOrErrorTy AfterIP =
7200 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7201 if (!AfterIP)
7202 return AfterIP.takeError();
7203 Builder.restoreIP(*AfterIP);
7204 }
7205 return Error::success();
7206 };
7207
7208 // If we need device pointer privatization, we need to emit the body of the
7209 // region with no privatization in the 'else' branch of the conditional.
7210 // Otherwise, we don't have to do anything.
7211 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7212 InsertPointTy CodeGenIP) -> Error {
7213 InsertPointOrErrorTy AfterIP =
7214 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7215 if (!AfterIP)
7216 return AfterIP.takeError();
7217 Builder.restoreIP(*AfterIP);
7218 return Error::success();
7219 };
7220
7221 // Generate code for the closing of the data region.
7222 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7223 TargetDataRTArgs RTArgs;
7224 Info.EmitDebug = !MapInfo->Names.empty();
7225 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7226
7227 // Emit the number of elements in the offloading arrays.
7228 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7229
7230 // Source location for the ident struct
7231 if (!SrcLocInfo) {
7232 uint32_t SrcLocStrSize;
7233 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7234 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7235 }
7236
7237 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7238 PointerNum, RTArgs.BasePointersArray,
7239 RTArgs.PointersArray, RTArgs.SizesArray,
7240 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7241 RTArgs.MappersArray};
7242 Function *EndMapperFunc =
7243 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7244
7245 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7246 return Error::success();
7247 };
7248
7249 // We don't have to do anything to close the region if the if clause evaluates
7250 // to false.
7251 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7252 return Error::success();
7253 };
7254
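// Taken together, for 'omp target data if(c) map(...)' with a body, the
// emitted structure is roughly:
//   if (c) __tgt_target_data_begin_mapper(...)   ; BeginThenGen
//   <body>                                       ; BodyGenCB, NoPriv
//   if (c) __tgt_target_data_end_mapper(...)     ; EndThenGen
// with the Priv/DupNoPriv body variants only emitted when device pointer
// privatization is required.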
7255 Error Err = [&]() -> Error {
7256 if (BodyGenCB) {
7257 Error Err = [&]() {
7258 if (IfCond)
7259 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7260 return BeginThenGen(AllocaIP, Builder.saveIP());
7261 }();
7262
7263 if (Err)
7264 return Err;
7265
7266 // If we don't require privatization of device pointers, we emit the body
7267 // in between the runtime calls. This avoids duplicating the body code.
7268 InsertPointOrErrorTy AfterIP =
7269 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7270 if (!AfterIP)
7271 return AfterIP.takeError();
7272 restoreIPandDebugLoc(Builder, *AfterIP);
7273
7274 if (IfCond)
7275 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7276 return EndThenGen(AllocaIP, Builder.saveIP());
7277 }
7278 if (IfCond)
7279 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7280 return BeginThenGen(AllocaIP, Builder.saveIP());
7281 }();
7282
7283 if (Err)
7284 return Err;
7285
7286 return Builder.saveIP();
7287}
7288
7289FunctionCallee
7290OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7291 bool IsGPUDistribute) {
7292 assert((IVSize == 32 || IVSize == 64) &&
7293 "IV size is not compatible with the omp runtime");
7294 RuntimeFunction Name;
7295 if (IsGPUDistribute)
7296 Name = IVSize == 32
7297 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7298 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7299 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7300 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7301 else
7302 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7303 : omp::OMPRTL___kmpc_for_static_init_4u)
7304 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7305 : omp::OMPRTL___kmpc_for_static_init_8u);
7306
7307 return getOrCreateRuntimeFunction(M, Name);
7308}
7309
7310FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7311 bool IVSigned) {
7312 assert((IVSize == 32 || IVSize == 64) &&
7313 "IV size is not compatible with the omp runtime");
7314 RuntimeFunction Name = IVSize == 32
7315 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7316 : omp::OMPRTL___kmpc_dispatch_init_4u)
7317 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7318 : omp::OMPRTL___kmpc_dispatch_init_8u);
7319
7320 return getOrCreateRuntimeFunction(M, Name);
7321}
7322
7323FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7324 bool IVSigned) {
7325 assert((IVSize == 32 || IVSize == 64) &&
7326 "IV size is not compatible with the omp runtime");
7327 RuntimeFunction Name = IVSize == 32
7328 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7329 : omp::OMPRTL___kmpc_dispatch_next_4u)
7330 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7331 : omp::OMPRTL___kmpc_dispatch_next_8u);
7332
7333 return getOrCreateRuntimeFunction(M, Name);
7334}
7335
7336FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7337 bool IVSigned) {
7338 assert((IVSize == 32 || IVSize == 64) &&
7339 "IV size is not compatible with the omp runtime");
7340 RuntimeFunction Name = IVSize == 32
7341 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7342 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7343 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7344 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7345
7346 return getOrCreateRuntimeFunction(M, Name);
7347}
7348
7349FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7350 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7351}
7352
7353static void FixupDebugInfoForOutlinedFunction(
7354 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7355 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7356
7357 DISubprogram *NewSP = Func->getSubprogram();
7358 if (!NewSP)
7359 return;
7360
7361 DenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7362
7363 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7364 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7365 // Only use cached variable if the arg number matches. This is important
7366 // so that DIVariables created for privatized variables are not discarded.
7367 if (NewVar && (arg == NewVar->getArg()))
7368 return NewVar;
7369
7370 NewVar = llvm::DILocalVariable::get(
7371 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7372 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7373 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7374 return NewVar;
7375 };
7376
7377 auto UpdateDebugRecord = [&](auto *DR) {
7378 DILocalVariable *OldVar = DR->getVariable();
7379 unsigned ArgNo = 0;
7380 for (auto Loc : DR->location_ops()) {
7381 auto Iter = ValueReplacementMap.find(Loc);
7382 if (Iter != ValueReplacementMap.end()) {
7383 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7384 ArgNo = std::get<1>(Iter->second) + 1;
7385 }
7386 }
7387 if (ArgNo != 0)
7388 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7389 };
7390
7391 // The location and scope of variable intrinsics and records still point to
7392 // the parent function of the target region. Update them.
7393 for (Instruction &I : instructions(Func)) {
7395 "Unexpected debug intrinsic");
7396 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7397 UpdateDebugRecord(&DVR);
7398 }
7399 // An extra argument is passed to the device. Create the debug data for it.
7400 if (OMPBuilder.Config.isTargetDevice()) {
7401 DICompileUnit *CU = NewSP->getUnit();
7402 Module *M = Func->getParent();
7403 DIBuilder DB(*M, true, CU);
7404 DIType *VoidPtrTy =
7405 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7406 DILocalVariable *Var = DB.createParameterVariable(
7407 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7408 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7409 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7410 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7411 &(*Func->begin()));
7412 }
7413}
7414
7415static Expected<Function *> createOutlinedFunction(
7416 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7417 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7418 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7419 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7420 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7421 SmallVector<Type *> ParameterTypes;
7422 if (OMPBuilder.Config.isTargetDevice()) {
7423 // Add the "implicit" runtime argument we use to provide launch specific
7424 // information for target devices.
7425 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7426 ParameterTypes.push_back(Int8PtrTy);
7427
7428 // All parameters to target devices are passed as pointers
7429 // or i64. This assumes 64-bit address spaces/pointers.
7430 for (auto &Arg : Inputs)
7431 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7432 ? Arg->getType()
7433 : Type::getInt64Ty(Builder.getContext()));
7434 } else {
7435 for (auto &Arg : Inputs)
7436 ParameterTypes.push_back(Arg->getType());
7437 }
7438
7439 auto BB = Builder.GetInsertBlock();
7440 auto M = BB->getModule();
7441 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7442 /*isVarArg*/ false);
7443 auto Func =
7444 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7445
7446 // Forward target-cpu and target-features function attributes from the
7447 // original function to the new outlined function.
7448 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7449
7450 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7451 if (TargetCpuAttr.isStringAttribute())
7452 Func->addFnAttr(TargetCpuAttr);
7453
7454 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7455 if (TargetFeaturesAttr.isStringAttribute())
7456 Func->addFnAttr(TargetFeaturesAttr);
7457
7458 if (OMPBuilder.Config.isTargetDevice()) {
7459 Value *ExecMode =
7460 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7461 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7462 }
7463
7464 // Save insert point.
7465 IRBuilder<>::InsertPointGuard IPG(Builder);
7466 // We will generate the entries in the outlined function but the debug
7467 // location may still be pointing to the parent function. Reset it now.
7468 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7469
7470 // Generate the region into the function.
7471 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7472 Builder.SetInsertPoint(EntryBB);
7473
7474 // Insert target init call in the device compilation pass.
7475 if (OMPBuilder.Config.isTargetDevice())
7476 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7477
7478 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7479
7480 // As we embed the user code in the middle of our target region after we
7481 // generate entry code, we must move what allocas we can into the entry
7482 // block, to avoid possibly breaking optimisations for the device.
7483 if (OMPBuilder.Config.isTargetDevice())
7484 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7485
7486 // Insert target deinit call in the device compilation pass.
7487 BasicBlock *OutlinedBodyBB =
7488 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7489 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7490 Builder.saveIP(),
7491 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7492 if (!AfterIP)
7493 return AfterIP.takeError();
7494 Builder.restoreIP(*AfterIP);
7495 if (OMPBuilder.Config.isTargetDevice())
7496 OMPBuilder.createTargetDeinit(Builder);
7497
7498 // Insert return instruction.
7499 Builder.CreateRetVoid();
7500
7501 // New Alloca IP at entry point of created device function.
7502 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7503 auto AllocaIP = Builder.saveIP();
7504
7505 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7506
7507 // Skip the artificial dyn_ptr on the device.
7508 const auto &ArgRange =
7509 OMPBuilder.Config.isTargetDevice()
7510 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7511 : Func->args();
7512
7513 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7514
7515 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7516 // Things like GEP's can come in the form of Constants. Constants and
7517 // ConstantExpr's do not have access to the knowledge of what they're
7518 // contained in, so we must dig a little to find an instruction so we
7519 // can tell if they're used inside of the function we're outlining. We
7520 // also replace the original constant expression with a new instruction
7521 // equivalent, since an instruction allows easy modification in the
7522 // following loop, as we can now know the constant (instruction) is
7523 // owned by our target function and replaceUsesOfWith can now be invoked
7524 // on it (cannot do this with constants it seems). A brand new one also
7525 // allows us to be cautious as it is perhaps possible the old expression
7526 // was used inside of the function but exists and is used externally
7527 // (unlikely by the nature of a Constant, but still).
7528 // NOTE: We cannot remove dead constants that have been rewritten to
7529 // instructions at this stage, we run the risk of breaking later lowering
7530 // by doing so as we could still be in the process of lowering the module
7531 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7532 // constants we have created rewritten versions of.
7533 if (auto *Const = dyn_cast<Constant>(Input))
7534 convertUsersOfConstantsToInstructions(Const, Func, false);
7535
7536 // Collect users before iterating over them to avoid invalidating the
7537 // iteration in case a user uses Input more than once (e.g. a call
7538 // instruction).
7539 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7540 // Collect all the instructions
7541 for (User *User : make_early_inc_range(Users))
7542 if (auto *Instr = dyn_cast<Instruction>(User))
7543 if (Instr->getFunction() == Func)
7544 Instr->replaceUsesOfWith(Input, InputCopy);
7545 };
7546
7547 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7548
7549 // Rewrite uses of input values to parameters.
7550 for (auto InArg : zip(Inputs, ArgRange)) {
7551 Value *Input = std::get<0>(InArg);
7552 Argument &Arg = std::get<1>(InArg);
7553 Value *InputCopy = nullptr;
7554
7555 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7556 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7557 if (!AfterIP)
7558 return AfterIP.takeError();
7559 Builder.restoreIP(*AfterIP);
7560 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7561
7562 // In certain cases a Global may be set up for replacement, however, this
7563 // Global may be used in multiple arguments to the kernel, just segmented
7564 // apart. For example, if we have a global array that is sectioned into
7565 // multiple mappings (technically not legal in OpenMP, but there is a case
7566 // in Fortran for Common Blocks where this is necessary), we will end up
7567 // with GEPs into this array inside the kernel, that refer to the Global
7568 // but are technically separate arguments to the kernel for all intents and
7569 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7570 // index, it will fold into a reference to the Global; if we then encounter
7571 // this folded GEP during replacement, all of the references to the
7572 // Global in the kernel will be replaced with the argument we have generated
7573 // that corresponds to it, including any other GEPs that refer to the
7574 // Global that may be other arguments. This will invalidate all of the other
7575 // preceding mapped arguments that refer to the same global that may be
7576 // separate segments. To prevent this, we defer global processing until all
7577 // other processing has been performed.
7578 if (isa<GlobalValue>(Input)) {
7579 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7580 continue;
7581 }
7582
7583 if (isa<ConstantData>(Input))
7584 continue;
7585
7586 ReplaceValue(Input, InputCopy, Func);
7587 }
7588
7589 // Replace all of our deferred Input values, currently just Globals.
7590 for (auto Deferred : DeferredReplacement)
7591 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7592
7593 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7594 ValueReplacementMap);
7595 return Func;
7596}
7597/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7598/// of pointers containing shared data between the parent task and the created
7599/// task.
7600static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7601 IRBuilderBase &Builder,
7602 Value *TaskWithPrivates,
7603 Type *TaskWithPrivatesTy) {
7604
7605 Type *TaskTy = OMPIRBuilder.Task;
7606 LLVMContext &Ctx = Builder.getContext();
7607 Value *TaskT =
7608 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7609 Value *Shareds = TaskT;
7610 // TaskWithPrivatesTy can be one of the following
7611 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7612 // %struct.privates }
7613 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7614 //
7615 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7616 // its first member has to be the task descriptor. TaskTy is the type of the
7617 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7618 // first member of TaskT, gives us the pointer to shared data.
7619 if (TaskWithPrivatesTy != TaskTy)
7620 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7621 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7622}
7623/// Create an entry point for a target task. It'll have the following
7624/// signature:
7625/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7626/// This function is called from emitTargetTask once the
7627/// code to launch the target kernel has been outlined already.
7628/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7629/// into the task structure so that the deferred target task can access this
7630/// data even after the stack frame of the generating task has been rolled
7631/// back. Offloading arrays contain base pointers, pointers, sizes etc
7632/// of the data that the target kernel will access. These in effect are the
7633/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7634static Function *emitTargetTaskProxyFunction(
7635 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7636 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7637 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7638
7639 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7640 // This is because PrivatesTy is the type of the structure in which
7641 // we pass the offloading arrays to the deferred target task.
7642 assert((!NumOffloadingArrays || PrivatesTy) &&
7643 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7644 "to privatize");
7645
7646 Module &M = OMPBuilder.M;
7647 // KernelLaunchFunction is the target launch function, i.e.
7648 // the function that sets up kernel arguments and calls
7649 // __tgt_target_kernel to launch the kernel on the device.
7650 //
7651 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7652
7653 // StaleCI is the CallInst which is the call to the outlined
7654 // target kernel launch function. If there are local live-in values
7655 // that the outlined function uses then these are aggregated into a structure
7656 // which is passed as the second argument. If there are no local live-in
7657 // values or if all values used by the outlined kernel are global variables,
7658 // then there's only one argument, the threadID. So, StaleCI can be
7659 //
7660 // %structArg = alloca { ptr, ptr }, align 8
7661 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7662 // store ptr %20, ptr %gep_, align 8
7663 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7664 // store ptr %21, ptr %gep_8, align 8
7665 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7666 //
7667 // OR
7668 //
7669 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7670 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7671 StaleCI->getIterator());
7672
7673 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7674
7675 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7676 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7677 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7678
7679 auto ProxyFnTy =
7680 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7681 /* isVarArg */ false);
7682 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7683 ".omp_target_task_proxy_func",
7684 Builder.GetInsertBlock()->getModule());
7685 Value *ThreadId = ProxyFn->getArg(0);
7686 Value *TaskWithPrivates = ProxyFn->getArg(1);
7687 ThreadId->setName("thread.id");
7688 TaskWithPrivates->setName("task");
7689
7690 bool HasShareds = SharedArgsOperandNo > 0;
7691 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7692 BasicBlock *EntryBB =
7693 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7694 Builder.SetInsertPoint(EntryBB);
7695
7696 SmallVector<Value *> KernelLaunchArgs;
7697 KernelLaunchArgs.reserve(StaleCI->arg_size());
7698 KernelLaunchArgs.push_back(ThreadId);
7699
7700 if (HasOffloadingArrays) {
7701 assert(TaskTy != TaskWithPrivatesTy &&
7702 "If there are offloading arrays to pass to the target"
7703 "TaskTy cannot be the same as TaskWithPrivatesTy");
7704 (void)TaskTy;
7705 Value *Privates =
7706 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7707 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7708 KernelLaunchArgs.push_back(
7709 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7710 }
7711
7712 if (HasShareds) {
7713 auto *ArgStructAlloca =
7714 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7715 assert(ArgStructAlloca &&
7716 "Unable to find the alloca instruction corresponding to arguments "
7717 "for extracted function");
7718 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7719
7720 AllocaInst *NewArgStructAlloca =
7721 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7722
7723 Value *SharedsSize =
7724 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7725
7727 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7728
7729 Builder.CreateMemCpy(
7730 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7731 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7732 KernelLaunchArgs.push_back(NewArgStructAlloca);
7733 }
7734 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7735 Builder.CreateRetVoid();
7736 return ProxyFn;
7737}
7738static Type *getOffloadingArrayType(Value *V) {
7739
7740 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7741 return GEP->getSourceElementType();
7742 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7743 return Alloca->getAllocatedType();
7744
7745 llvm_unreachable("Unhandled Instruction type");
7746 return nullptr;
7747}
7748// This function returns a struct that has at most two members.
7749// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7750// descriptor. The second member, if needed, is a struct containing arrays
7751// that need to be passed to the offloaded target kernel. For example,
7752// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7753// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7754// respectively, then the types created by this function are
7755//
7756// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7757// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7758// %struct.privates }
7759// %struct.task_with_privates is returned by this function.
7760// If there aren't any offloading arrays to pass to the target kernel,
7761// %struct.kmp_task_ompbuilder_t is returned.
7762static StructType *
7763createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7764 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7765
7766 if (OffloadingArraysToPrivatize.empty())
7767 return OMPIRBuilder.Task;
7768
7769 SmallVector<Type *, 4> StructFieldTypes;
7770 for (Value *V : OffloadingArraysToPrivatize) {
7771 assert(V->getType()->isPointerTy() &&
7772 "Expected pointer to array to privatize. Got a non-pointer value "
7773 "instead");
7774 Type *ArrayTy = getOffloadingArrayType(V);
7775 assert(ArrayTy && "ArrayType cannot be nullptr");
7776 StructFieldTypes.push_back(ArrayTy);
7777 }
7778 StructType *PrivatesStructTy =
7779 StructType::create(StructFieldTypes, "struct.privates");
7780 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7781 "struct.task_with_privates");
7782}
7783static Error emitTargetOutlinedFunction(
7784 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7785 TargetRegionEntryInfo &EntryInfo,
7786 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7787 Function *&OutlinedFn, Constant *&OutlinedFnID,
7788 SmallVectorImpl<Value *> &Inputs,
7789 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7790 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7791
7792 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7793 [&](StringRef EntryFnName) {
7794 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7795 EntryFnName, Inputs, CBFunc,
7796 ArgAccessorFuncCB);
7797 };
7798
7799 return OMPBuilder.emitTargetRegionFunction(
7800 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7801 OutlinedFnID);
7802}
7803
7804OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7805 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7806 OpenMPIRBuilder::InsertPointTy AllocaIP,
7807 const SmallVector<DependData> &Dependencies,
7808 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7809
7810 // The following explains the code-gen scenario for the `target` directive. A
7811 // similar scenario is followed for other device-related directives (e.g.
7812 // `target enter data`), since we only need to emit a task
7813 // that encapsulates the proper runtime call.
7814 //
7815 // When we arrive at this function, the target region itself has been
7816 // outlined into the function OutlinedFn.
7817 // So at this point, for
7818 // --------------------------------------------------------------
7819 // void user_code_that_offloads(...) {
7820 // omp target depend(..) map(from:a) map(to:b) private(i)
7821 // do i = 1, 10
7822 // a(i) = b(i) + n
7823 // }
7824 //
7825 // --------------------------------------------------------------
7826 //
7827 // we have
7828 //
7829 // --------------------------------------------------------------
7830 //
7831 // void user_code_that_offloads(...) {
7832 // %.offload_baseptrs = alloca [2 x ptr], align 8
7833 // %.offload_ptrs = alloca [2 x ptr], align 8
7834 // %.offload_mappers = alloca [2 x ptr], align 8
7835 // ;; target region has been outlined and now we need to
7836 // ;; offload to it via a target task.
7837 // }
7838 // void outlined_device_function(ptr a, ptr b, ptr n) {
7839 // n = *n_ptr;
7840 // do i = 1, 10
7841 // a(i) = b(i) + n
7842 // }
7843 //
7844 // We have to now do the following
7845 // (i) Make an offloading call to outlined_device_function using the OpenMP
7846 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7847 // emitted by emitKernelLaunch
7848 // (ii) Create a task entry point function that calls kernel_launch_function
7849 // and is the entry point for the target task. See
7850 // '@.omp_target_task_proxy_func in the pseudocode below.
7851 // (iii) Create a task with the task entry point created in (ii)
7852 //
7853 // That is we create the following
7854 // struct task_with_privates {
7855 // struct kmp_task_ompbuilder_t task_struct;
7856 // struct privates {
7857 // [2 x ptr] ; baseptrs
7858 // [2 x ptr] ; ptrs
7859 // [2 x i64] ; sizes
7860 // }
7861 // }
7862 // void user_code_that_offloads(...) {
7863 // %.offload_baseptrs = alloca [2 x ptr], align 8
7864 // %.offload_ptrs = alloca [2 x ptr], align 8
7865 // %.offload_sizes = alloca [2 x i64], align 8
7866 //
7867 // %structArg = alloca { ptr, ptr, ptr }, align 8
7868 // %structArg[0] = a
7869 // %structArg[1] = b
7870 // %structArg[2] = &n
7871 //
7872 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7873 // sizeof(kmp_task_ompbuilder_t),
7874 // sizeof(structArg),
7875 // @.omp_target_task_proxy_func,
7876 // ...)
7877 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7878 // sizeof(structArg))
7879 // memcpy(target_task_with_privates->privates->baseptrs,
7880 // offload_baseptrs, sizeof(offload_baseptrs)
7881 // memcpy(target_task_with_privates->privates->ptrs,
7882 // offload_ptrs, sizeof(offload_ptrs)
7883 // memcpy(target_task_with_privates->privates->sizes,
7884 // offload_sizes, sizeof(offload_sizes)
7885 // dependencies_array = ...
7886 // ;; if nowait not present
7887 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7888 // call @__kmpc_omp_task_begin_if0(...)
7889 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
7890 // %target_task_with_privates)
7891 // call @__kmpc_omp_task_complete_if0(...)
7892 // }
7893 //
7894 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7895 // ptr %task) {
7896 // %structArg = alloca {ptr, ptr, ptr}
7897 // %task_ptr = getelementptr(%task, 0, 0)
7898 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7899 // memcpy(%structArg, %shared_data, sizeof(%structArg))
7900 //
7901 // %offloading_arrays = getelementptr(%task, 0, 1)
7902 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7903 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7904 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7905 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7906 // %offload_sizes, %structArg)
7907 // }
7908 //
7909 // We need the proxy function because the signature of the task entry point
7910 // expected by kmpc_omp_task is always the same and will be different from
7911 // that of the kernel_launch function.
7912 //
7913 // kernel_launch_function is generated by emitKernelLaunch and has the
7914 // always_inline attribute. For this example, it'll look like so:
7915 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7916 // %offload_sizes, %structArg) alwaysinline {
7917 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7918 // ; load aggregated data from %structArg
7919 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7920 // ; offload_sizes
7921 // call i32 @__tgt_target_kernel(...,
7922 // outlined_device_function,
7923 // ptr %kernel_args)
7924 // }
7925 // void outlined_device_function(ptr a, ptr b, ptr n) {
7926 // n = *n_ptr;
7927 // do i = 1, 10
7928 // a(i) = b(i) + n
7929 // }
7930 //
7931 BasicBlock *TargetTaskBodyBB =
7932 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7933 BasicBlock *TargetTaskAllocaBB =
7934 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7935
7936 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7937 TargetTaskAllocaBB->begin());
7938 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7939
7940 OutlineInfo OI;
7941 OI.EntryBB = TargetTaskAllocaBB;
7942 OI.OuterAllocaBB = AllocaIP.getBlock();
7943
7944 // Add the thread ID argument.
7945 SmallVector<Instruction *, 4> ToBeDeleted;
7946 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7947 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7948
7949 // Generate the task body which will subsequently be outlined.
7950 Builder.restoreIP(TargetTaskBodyIP);
7951 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7952 return Err;
7953
7954 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7955 // it is given. These blocks are enumerated by
7956 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7957 // to be outside the region. In other words, OI.ExitBlock is expected to be
7958 // the start of the region after the outlining. We used to set OI.ExitBlock
7959 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7960 // except when the task body is a single basic block. In that case,
7961 // OI.ExitBlock is set to the single task body block and will get left out of
7962 // the outlining process. So, simply create a new empty block to which we
7963 // unconditionally branch from where TaskBodyCB left off.
7964 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7965 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7966 /*IsFinished=*/true);
7967
7968 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7969 bool NeedsTargetTask = HasNoWait && DeviceID;
7970 if (NeedsTargetTask) {
7971 for (auto *V :
7972 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7973 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7974 RTArgs.SizesArray}) {
7975 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
7976 OffloadingArraysToPrivatize.push_back(V);
7977 OI.ExcludeArgsFromAggregate.push_back(V);
7978 }
7979 }
7980 }
7981 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7982 DeviceID, OffloadingArraysToPrivatize](
7983 Function &OutlinedFn) mutable {
7984 assert(OutlinedFn.hasOneUse() &&
7985 "there must be a single user for the outlined function");
7986
7987 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7988
7989 // The first argument of StaleCI is always the thread id.
7990 // The next few arguments are the pointers to offloading arrays
7991 // if any. (see OffloadingArraysToPrivatize)
7992 // Finally, all other local values that are live-in into the outlined region
7993 // end up in a structure whose pointer is passed as the last argument. This
7994 // piece of data is passed in the "shared" field of the task structure. So,
7995 // we know we have to pass shareds to the task if the number of arguments is
7996 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
7997 // thread id. Further, for safety, we assert that the number of arguments of
7998 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
7999 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8000 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8001 assert((!HasShareds ||
8002 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8003 "Wrong number of arguments for StaleCI when shareds are present");
8004 int SharedArgOperandNo =
8005 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8006
8007 StructType *TaskWithPrivatesTy =
8008 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8009 StructType *PrivatesTy = nullptr;
8010
8011 if (!OffloadingArraysToPrivatize.empty())
8012 PrivatesTy =
8013 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8014
8015 Function *ProxyFn = emitTargetTaskProxyFunction(
8016 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8017 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8018
8019 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8020 << "\n");
8021
8022 Builder.SetInsertPoint(StaleCI);
8023
8024 // Gather the arguments for emitting the runtime call.
8025 uint32_t SrcLocStrSize;
8026 Constant *SrcLocStr =
8027 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8028 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8029
8030 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8031 //
8032 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8033 // the DeviceID to the deferred task, and also because
8034 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8035 Function *TaskAllocFn =
8036 !NeedsTargetTask
8037 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8038 : getOrCreateRuntimeFunctionPtr(
8039 OMPRTL___kmpc_omp_target_task_alloc);
8040
8041 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
8042 // call.
8043 Value *ThreadID = getOrCreateThreadID(Ident);
8044
8045 // Argument - `sizeof_kmp_task_t` (TaskSize)
8046 // Tasksize refers to the size in bytes of kmp_task_t data structure
8047 // plus any other data to be passed to the target task, if any, which
8048 // is packed into a struct. kmp_task_t and the struct so created are
8049 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8050 Value *TaskSize = Builder.getInt64(
8051 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8052
8053 // Argument - `sizeof_shareds` (SharedsSize)
8054 // SharedsSize refers to the shareds array size in the kmp_task_t data
8055 // structure.
8056 Value *SharedsSize = Builder.getInt64(0);
8057 if (HasShareds) {
8058 auto *ArgStructAlloca =
8059 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8060 assert(ArgStructAlloca &&
8061 "Unable to find the alloca instruction corresponding to arguments "
8062 "for extracted function");
8063 auto *ArgStructType =
8064 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8065 assert(ArgStructType && "Unable to find struct type corresponding to "
8066 "arguments for extracted function");
8067 SharedsSize =
8068 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8069 }
8070
8071 // Argument - `flags`
8072 // Task is tied iff (Flags & 1) == 1.
8073 // Task is untied iff (Flags & 1) == 0.
8074 // Task is final iff (Flags & 2) == 2.
8075 // Task is not final iff (Flags & 2) == 0.
8076 // A target task is not final and is untied.
8077 Value *Flags = Builder.getInt32(0);
8078
8079 // Emit the @__kmpc_omp_task_alloc runtime call
8080 // The runtime call returns a pointer to an area where the task captured
8081 // variables must be copied before the task is run (TaskData)
8082 CallInst *TaskData = nullptr;
8083
8084 SmallVector<llvm::Value *> TaskAllocArgs = {
8085 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8086 /*flags=*/Flags,
8087 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8088 /*task_func=*/ProxyFn};
8089
8090 if (NeedsTargetTask) {
8091 assert(DeviceID && "Expected non-empty device ID.");
8092 TaskAllocArgs.push_back(DeviceID);
8093 }
8094
8095 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8096
8097 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8098 if (HasShareds) {
8099 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8100 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8101 *this, Builder, TaskData, TaskWithPrivatesTy);
8102 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8103 SharedsSize);
8104 }
8105 if (!OffloadingArraysToPrivatize.empty()) {
8106 Value *Privates =
8107 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8108 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8109 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8110 [[maybe_unused]] Type *ArrayType =
8111 getOffloadingArrayType(PtrToPrivatize);
8112 assert(ArrayType && "ArrayType cannot be nullptr");
8113
8114 Type *ElementType = PrivatesTy->getElementType(i);
8115 assert(ElementType == ArrayType &&
8116 "ElementType should match ArrayType");
8117 (void)ArrayType;
8118
8119 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8120 Builder.CreateMemCpy(
8121 Dst, Alignment, PtrToPrivatize, Alignment,
8122 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8123 }
8124 }
8125
8126 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8127
8128 // ---------------------------------------------------------------
8129 // V5.2 13.8 target construct
8130 // If the nowait clause is present, execution of the target task
8131 // may be deferred. If the nowait clause is not present, the target task is
8132 // an included task.
8133 // ---------------------------------------------------------------
8134 // The above means that the lack of a nowait on the target construct
8135 // translates to '#pragma omp task if(0)'
8136 if (!NeedsTargetTask) {
8137 if (DepArray) {
8138 Function *TaskWaitFn =
8139 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8140 Builder.CreateCall(
8141 TaskWaitFn,
8142 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8143 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8144 /*dep_list=*/DepArray,
8145 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8147 /*noalias_dep_list=*/ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8148 }
8149 // Included task.
8150 Function *TaskBeginFn =
8151 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8152 Function *TaskCompleteFn =
8153 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8154 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8155 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8156 CI->setDebugLoc(StaleCI->getDebugLoc());
8157 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8158 } else if (DepArray) {
8159 // HasNoWait - meaning the task may be deferred. Call
8160 // __kmpc_omp_task_with_deps if there are dependencies,
8161 // else call __kmpc_omp_task
8162 Function *TaskFn =
8163 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8164 Builder.CreateCall(
8165 TaskFn,
8166 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8167 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8168 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8169 } else {
8170 // Emit the @__kmpc_omp_task runtime call to spawn the task
8171 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8172 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8173 }
8174
8175 StaleCI->eraseFromParent();
8176 for (Instruction *I : llvm::reverse(ToBeDeleted))
8177 I->eraseFromParent();
8178 };
8179 addOutlineInfo(std::move(OI));
8180
8181 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8182 << *(Builder.GetInsertBlock()) << "\n");
8183 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8184 << *(Builder.GetInsertBlock()->getParent()->getParent())
8185 << "\n");
8186 return Builder.saveIP();
8187}
8188
8189Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8190 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8191 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8192 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8193 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8194 if (Error Err =
8195 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8196 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8197 return Err;
8198 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8199 return Error::success();
8200}
8201
8202static void emitTargetCall(
8203 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8204 OpenMPIRBuilder::InsertPointTy AllocaIP,
8205 OpenMPIRBuilder::TargetDataInfo &Info,
8206 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8207 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8208 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8209 SmallVectorImpl<Value *> &Args,
8210 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8211 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8212 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8213 bool HasNoWait) {
8214 // Generate a function call to the host fallback implementation of the target
8215 // region. This is called by the host when no offload entry was generated for
8216 // the target region and when the offloading call fails at runtime.
8217 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8218 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8219 Builder.restoreIP(IP);
8220 Builder.CreateCall(OutlinedFn, Args);
8221 return Builder.saveIP();
8222 };
8223
8224 bool HasDependencies = Dependencies.size() > 0;
8225 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8226
8227 OpenMPIRBuilder::TargetKernelArgs KArgs;
8228
8229 auto TaskBodyCB =
8230 [&](Value *DeviceID, Value *RTLoc,
8231 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8232 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8233 // produce any.
8234 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8235 // emitKernelLaunch makes the necessary runtime call to offload the
8236 // kernel. We then outline all that code into a separate function
8237 // ('kernel_launch_function' in the pseudo code above). This function is
8238 // then called by the target task proxy function (see
8239 // '@.omp_target_task_proxy_func' in the pseudo code above)
8240 // "@.omp_target_task_proxy_func' is generated by
8241 // emitTargetTaskProxyFunction.
8242 if (OutlinedFnID && DeviceID)
8243 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8244 EmitTargetCallFallbackCB, KArgs,
8245 DeviceID, RTLoc, TargetTaskAllocaIP);
8246
8247 // We only need to do the outlining if `DeviceID` is set to avoid calling
8248 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8249 // generating the `else` branch of an `if` clause.
8250 //
8251 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8252 // In this case, we execute the host implementation directly.
8253 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8254 }());
8255
8256 OMPBuilder.Builder.restoreIP(AfterIP);
8257 return Error::success();
8258 };
8259
8260 auto &&EmitTargetCallElse =
8261 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8262 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8263 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8264 // produce any.
8265 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8266 if (RequiresOuterTargetTask) {
8267 // Arguments that are intended to be directly forwarded to an
8268 // emitKernelLaunch call are passed as nullptr, since
8269 // OutlinedFnID=nullptr results in that call not being done.
8270 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8271 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8272 /*RTLoc=*/nullptr, AllocaIP,
8273 Dependencies, EmptyRTArgs, HasNoWait);
8274 }
8275 return EmitTargetCallFallbackCB(Builder.saveIP());
8276 }());
8277
8278 Builder.restoreIP(AfterIP);
8279 return Error::success();
8280 };
8281
8282 auto &&EmitTargetCallThen =
8283 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8284 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8285 Info.HasNoWait = HasNoWait;
8286 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8287 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8288 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8289 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8290 /*IsNonContiguous=*/true,
8291 /*ForEndCall=*/false))
8292 return Err;
8293
8294 SmallVector<Value *, 3> NumTeamsC;
8295 for (auto [DefaultVal, RuntimeVal] :
8296 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8297 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8298 : Builder.getInt32(DefaultVal));
8299
8300 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8301 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8302 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8303 if (Clause)
8304 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8305 /*isSigned=*/false);
8306 return Clause;
8307 };
8308 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8309 if (Clause)
8310 Result =
8311 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8312 Result, Clause)
8313 : Clause;
8314 };
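// Together, these lambdas compute an unsigned minimum over whichever clauses
// are present. For instance, 'thread_limit(16) num_threads(8)' lowers to
// select(icmp ult(16, 8), 16, 8) == 8; when no clause is given, the
// Builder.getInt32(0) default below is used.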
8315
8316 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8317 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8318 SmallVector<Value *, 3> NumThreadsC;
8319 Value *MaxThreadsClause =
8320 RuntimeAttrs.TeamsThreadLimit.size() == 1
8321 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8322 : nullptr;
8323
8324 for (auto [TeamsVal, TargetVal] : zip_equal(
8325 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8326 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8327 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8328
8329 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8330 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8331
8332 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8333 }
8334
8335 unsigned NumTargetItems = Info.NumberOfPtrs;
8336 // TODO: Use correct device ID
8337 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8338 uint32_t SrcLocStrSize;
8339 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8340 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8341 llvm::omp::IdentFlag(0), 0);
8342
8343 Value *TripCount = RuntimeAttrs.LoopTripCount
8344 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8345 Builder.getInt64Ty(),
8346 /*isSigned=*/false)
8347 : Builder.getInt64(0);
8348
8349 // TODO: Use correct DynCGGroupMem
8350 Value *DynCGGroupMem = Builder.getInt32(0);
8351
8352 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8353 NumTeamsC, NumThreadsC,
8354 DynCGGroupMem, HasNoWait);
8355
8356 // Assume no error was returned because TaskBodyCB and
8357 // EmitTargetCallFallbackCB don't produce any.
8358 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8359 // The presence of certain clauses on the target directive requires the
8360 // explicit generation of the target task.
8361 if (RequiresOuterTargetTask)
8362 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8363 Dependencies, KArgs.RTArgs,
8364 Info.HasNoWait);
8365
8366 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8367 EmitTargetCallFallbackCB, KArgs,
8368 DeviceID, RTLoc, AllocaIP);
8369 }());
8370
8371 Builder.restoreIP(AfterIP);
8372 return Error::success();
8373 };
8374
8375 // If we don't have an ID for the target region, it means an offload entry
8376 // wasn't created. In this case we just run the host fallback directly and
8377 // ignore any potential 'if' clauses.
8378 if (!OutlinedFnID) {
8379 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8380 return;
8381 }
8382
8383 // If there's no 'if' clause, only generate the kernel launch code path.
8384 if (!IfCond) {
8385 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8386 return;
8387 }
8388
8389 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8390 EmitTargetCallElse, AllocaIP));
8391}
8392
8393OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8394 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8395 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8396 TargetRegionEntryInfo &EntryInfo,
8397 const TargetKernelDefaultAttrs &DefaultAttrs,
8398 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8399 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8400 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8401 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8402 CustomMapperCallbackTy CustomMapperCB,
8403 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8404
8405 if (!updateToLocation(Loc))
8406 return InsertPointTy();
8407
8408 Builder.restoreIP(CodeGenIP);
8409
8410 Function *OutlinedFn;
8411 Constant *OutlinedFnID = nullptr;
8412 // The target region is outlined into its own function. The LLVM IR for
8413 // the target region itself is generated using the callbacks CBFunc
8414 // and ArgAccessorFuncCB.
8415 if (Error Err = emitTargetOutlinedFunction(
8416 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8417 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8418 return Err;
8419
8420 // If we are not on the target device, then we need to generate code
8421 // to make a remote call (offload) to the previously outlined function
8422 // that represents the target region. Do that now.
8423 if (!Config.isTargetDevice())
8424 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8425 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8426 CustomMapperCB, Dependencies, HasNowait);
8427 return Builder.saveIP();
8428}
8429
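// For instance, getNameWithSeparators({"x", "y"}, ".", "$") yields ".x$y":
// the first separator prefixes the first part and the second separator joins
// the remaining parts.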
8430std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8431 StringRef FirstSeparator,
8432 StringRef Separator) {
8433 SmallString<128> Buffer;
8434 llvm::raw_svector_ostream OS(Buffer);
8435 StringRef Sep = FirstSeparator;
8436 for (StringRef Part : Parts) {
8437 OS << Sep << Part;
8438 Sep = Separator;
8439 }
8440 return OS.str().str();
8441}
8442
8443std::string
8444OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8445 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8446 Config.separator());
8447}
8448
8449GlobalVariable *
8450OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
8451 unsigned AddressSpace) {
8452 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8453 if (Elem.second) {
8454 assert(Elem.second->getValueType() == Ty &&
8455 "OMP internal variable has different type than requested");
8456 } else {
8457 // TODO: investigate the appropriate linkage type used for the global
8458 // variable for possibly changing that to internal or private, or maybe
8459 // create different versions of the function for different OMP internal
8460 // variables.
8461 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8462 ? GlobalValue::InternalLinkage
8463 : GlobalValue::CommonLinkage;
8464 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8465 Constant::getNullValue(Ty), Elem.first(),
8466 /*InsertBefore=*/nullptr,
8467 GlobalValue::NotThreadLocal, AddressSpace);
8468 const DataLayout &DL = M.getDataLayout();
8469 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8470 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
8471 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8472 Elem.second = GV;
8473 }
8474
8475 return Elem.second;
8476}
8477
8478Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8479 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8480 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8481 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8482}
8483
8484Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8485 LLVMContext &Ctx = Builder.getContext();
8486 Value *Null =
8487 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8488 Value *SizeGep =
8489 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8490 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8491 return SizePtrToInt;
8492}
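//
// A sketch of the IR this emits: the classic "size via GEP" idiom, where the
// address of element 1 of a null pointer equals the allocation size of the
// GEP'd type (here the pointer type of \p BasePtr):
// ```
//   %SizeGep      = getelementptr ptr, ptr null, i32 1
//   %SizePtrToInt = ptrtoint ptr %SizeGep to i64
// ```
//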
8493
8494GlobalVariable *
8495OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8496 std::string VarName) {
8497 llvm::Constant *MaptypesArrayInit =
8498 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8499 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8500 M, MaptypesArrayInit->getType(),
8501 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8502 VarName);
8503 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8504 return MaptypesArrayGlobal;
8505}
8506
8507void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8508 InsertPointTy AllocaIP,
8509 unsigned NumOperands,
8510 struct MapperAllocas &MapperAllocas) {
8511 if (!updateToLocation(Loc))
8512 return;
8513
8514 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8515 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8516 Builder.restoreIP(AllocaIP);
8517 AllocaInst *ArgsBase = Builder.CreateAlloca(
8518 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8519 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8520 ".offload_ptrs");
8521 AllocaInst *ArgSizes = Builder.CreateAlloca(
8522 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8523 updateToLocation(Loc);
8524 MapperAllocas.ArgsBase = ArgsBase;
8525 MapperAllocas.Args = Args;
8526 MapperAllocas.ArgSizes = ArgSizes;
8527}
8528
8529void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8530 Function *MapperFunc, Value *SrcLocInfo,
8531 Value *MaptypesArg, Value *MapnamesArg,
8532 struct MapperAllocas &MapperAllocas,
8533 int64_t DeviceID, unsigned NumOperands) {
8534 if (!updateToLocation(Loc))
8535 return;
8536
8537 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8538 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8539 Value *ArgsBaseGEP =
8540 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8541 {Builder.getInt32(0), Builder.getInt32(0)});
8542 Value *ArgsGEP =
8543 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8544 {Builder.getInt32(0), Builder.getInt32(0)});
8545 Value *ArgSizesGEP =
8546 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8547 {Builder.getInt32(0), Builder.getInt32(0)});
8548 Value *NullPtr =
8549 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8550 Builder.CreateCall(MapperFunc,
8551 {SrcLocInfo, Builder.getInt64(DeviceID),
8552 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8553 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8554}
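//
// Schematically, the emitted call has the canonical mapper-runtime shape
// (a sketch; \p MapperFunc is typically one of the
// __tgt_target_data_{begin,end,update}_mapper entry points):
// ```
//   call void @__tgt_target_data_begin_mapper(
//       ptr %srcloc, i64 %device_id, i32 %num_operands, ptr %baseptrs,
//       ptr %ptrs, ptr %sizes, ptr %maptypes, ptr %mapnames, ptr null)
// ```
//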
8555
8556void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8557 TargetDataRTArgs &RTArgs,
8558 TargetDataInfo &Info,
8559 bool ForEndCall) {
8560 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8561 "expected region end call to runtime only when end call is separate");
8562 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8563 auto VoidPtrTy = UnqualPtrTy;
8564 auto VoidPtrPtrTy = UnqualPtrTy;
8565 auto Int64Ty = Type::getInt64Ty(M.getContext());
8566 auto Int64PtrTy = UnqualPtrTy;
8567
8568 if (!Info.NumberOfPtrs) {
8569 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8570 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8571 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8572 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8573 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8574 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8575 return;
8576 }
8577
8578 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8579 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8580 Info.RTArgs.BasePointersArray,
8581 /*Idx0=*/0, /*Idx1=*/0);
8582 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8583 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8584 /*Idx0=*/0,
8585 /*Idx1=*/0);
8586 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8587 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8588 /*Idx0=*/0, /*Idx1=*/0);
8589 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8590 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8591 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8592 : Info.RTArgs.MapTypesArray,
8593 /*Idx0=*/0,
8594 /*Idx1=*/0);
8595
8596 // Only emit the mapper information arrays if debug information is
8597 // requested.
8598 if (!Info.EmitDebug)
8599 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8600 else
8601 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8602 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8603 /*Idx0=*/0,
8604 /*Idx1=*/0);
8605 // If there is no user-defined mapper, set the mapper array to nullptr to
8606 // avoid an unnecessary data privatization
8607 if (!Info.HasMapper)
8608 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8609 else
8610 RTArgs.MappersArray =
8611 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8612}
8613
8614void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8615 InsertPointTy CodeGenIP,
8616 MapInfosTy &CombinedInfo,
8617 TargetDataInfo &Info) {
8618 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8619 CombinedInfo.NonContigInfo;
8620
8621 // Build an array of struct descriptor_dim and then assign it to
8622 // offload_args.
8623 //
8624 // struct descriptor_dim {
8625 // uint64_t offset;
8626 // uint64_t count;
8627 // uint64_t stride
8628 // };
8629 Type *Int64Ty = Builder.getInt64Ty();
8630 StructType *DimTy = StructType::create(
8631 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8632 "struct.descriptor_dim");
8633
8634 enum { OffsetFD = 0, CountFD, StrideFD };
8635 // We need two index variables here since the size of "Dims" is the same as
8636 // the size of Components; however, the sizes of offset, count, and stride
8637 // are equal to the size of the base declaration that is non-contiguous.
8638 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8639 // Skip emitting IR if the dimension size is 1, since such a dimension
8640 // cannot be non-contiguous.
8641 if (NonContigInfo.Dims[I] == 1)
8642 continue;
8643 Builder.restoreIP(AllocaIP);
8644 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8645 AllocaInst *DimsAddr =
8646 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8647 Builder.restoreIP(CodeGenIP);
8648 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8649 unsigned RevIdx = EE - II - 1;
8650 Value *DimsLVal = Builder.CreateInBoundsGEP(
8651 DimsAddr->getAllocatedType(), DimsAddr,
8652 {Builder.getInt64(0), Builder.getInt64(II)});
8653 // Offset
8654 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8655 Builder.CreateAlignedStore(
8656 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8657 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8658 // Count
8659 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8660 Builder.CreateAlignedStore(
8661 NonContigInfo.Counts[L][RevIdx], CountLVal,
8662 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8663 // Stride
8664 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8665 Builder.CreateAlignedStore(
8666 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8667 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8668 }
8669 // args[I] = &dims
8670 Builder.restoreIP(CodeGenIP);
8671 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8672 DimsAddr, Builder.getPtrTy());
8673 Value *P = Builder.CreateConstInBoundsGEP2_32(
8674 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8675 Info.RTArgs.PointersArray, 0, I);
8676 Builder.CreateAlignedStore(
8677 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8678 ++L;
8679 }
8680}
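//
// As a sketch: for a strided section such as the Fortran `map(a(1:100:2))`,
// each mapped dimension contributes one descriptor_dim entry (stored
// innermost-first via RevIdx above) and the corresponding .offload_ptrs slot
// is redirected to the descriptor array instead of the data:
// ```
//   %dims = alloca [1 x %struct.descriptor_dim]  ; {offset, count, stride}
//   ; ...stores filling offset/count/stride...
//   ; .offload_ptrs[I] <- %dims
// ```
//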
8681
8682void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8683 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8684 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8685 BasicBlock *ExitBB, bool IsInit) {
8686 StringRef Prefix = IsInit ? ".init" : ".del";
8687
8688 // Evaluate if this is an array section.
8689 BasicBlock *BodyBB = BasicBlock::Create(
8690 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8691 Value *IsArray =
8692 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8693 Value *DeleteBit = Builder.CreateAnd(
8694 MapType,
8695 Builder.getInt64(
8696 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8697 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8698 Value *DeleteCond;
8699 Value *Cond;
8700 if (IsInit) {
8701 // base != begin?
8702 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8703 // IsPtrAndObj?
8704 Value *PtrAndObjBit = Builder.CreateAnd(
8705 MapType,
8706 Builder.getInt64(
8707 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8708 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8709 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8710 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8711 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8712 DeleteCond = Builder.CreateIsNull(
8713 DeleteBit,
8714 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8715 } else {
8716 Cond = IsArray;
8717 DeleteCond = Builder.CreateIsNotNull(
8718 DeleteBit,
8719 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8720 }
8721 Cond = Builder.CreateAnd(Cond, DeleteCond);
8722 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8723
8724 emitBlock(BodyBB, MapperFn);
8725 // Get the array size by multiplying element size and element number (i.e., \p
8726 // Size).
8727 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8728 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that the call
8729 // serves memory allocation/deletion purposes only.
8730 Value *MapTypeArg = Builder.CreateAnd(
8731 MapType,
8732 Builder.getInt64(
8733 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8734 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8735 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8736 MapTypeArg = Builder.CreateOr(
8737 MapTypeArg,
8738 Builder.getInt64(
8739 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8740 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8741
8742 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8743 // data structure.
8744 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8745 ArraySize, MapTypeArg, MapName};
8746 Builder.CreateCall(
8747 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8748 OffloadingArgs);
8749}
8750
8751Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8752 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8753 llvm::Value *BeginArg)>
8754 GenMapInfoCB,
8755 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8756 SmallVector<Type *> Params;
8757 Params.emplace_back(Builder.getPtrTy());
8758 Params.emplace_back(Builder.getPtrTy());
8759 Params.emplace_back(Builder.getPtrTy());
8760 Params.emplace_back(Builder.getInt64Ty());
8761 Params.emplace_back(Builder.getInt64Ty());
8762 Params.emplace_back(Builder.getPtrTy());
8763
8764 auto *FnTy =
8765 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8766
8767 SmallString<64> TyStr;
8768 raw_svector_ostream Out(TyStr);
8769 Function *MapperFn =
8770 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
8771 MapperFn->addFnAttr(Attribute::NoInline);
8772 MapperFn->addFnAttr(Attribute::NoUnwind);
8773 MapperFn->addParamAttr(0, Attribute::NoUndef);
8774 MapperFn->addParamAttr(1, Attribute::NoUndef);
8775 MapperFn->addParamAttr(2, Attribute::NoUndef);
8776 MapperFn->addParamAttr(3, Attribute::NoUndef);
8777 MapperFn->addParamAttr(4, Attribute::NoUndef);
8778 MapperFn->addParamAttr(5, Attribute::NoUndef);
8779
8780 // Start the mapper function code generation.
8781 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8782 auto SavedIP = Builder.saveIP();
8783 Builder.SetInsertPoint(EntryBB);
8784
8785 Value *MapperHandle = MapperFn->getArg(0);
8786 Value *BaseIn = MapperFn->getArg(1);
8787 Value *BeginIn = MapperFn->getArg(2);
8788 Value *Size = MapperFn->getArg(3);
8789 Value *MapType = MapperFn->getArg(4);
8790 Value *MapName = MapperFn->getArg(5);
8791
8792 // Compute the start and end addresses of the array elements.
8793 // Prepare common arguments for array initialization and deletion.
8794 // Convert the size in bytes into the number of array elements.
8795 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8796 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8797 Value *PtrBegin = BeginIn;
8798 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8799
8800 // Emit array initialization if this is an array section and \p MapType
8801 // indicates that memory allocation is required.
8802 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8803 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8804 MapType, MapName, ElementSize, HeadBB,
8805 /*IsInit=*/true);
8806
8807 // Emit a for loop that iterates through \p Size elements and maps each one.
8808
8809 // Emit the loop header block.
8810 emitBlock(HeadBB, MapperFn);
8811 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8812 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8813 // Evaluate whether the initial condition is satisfied.
8814 Value *IsEmpty =
8815 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8816 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8817
8818 // Emit the loop body block.
8819 emitBlock(BodyBB, MapperFn);
8820 BasicBlock *LastBB = BodyBB;
8821 PHINode *PtrPHI =
8822 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8823 PtrPHI->addIncoming(PtrBegin, HeadBB);
8824
8825 // Get map clause information. Fill up the arrays with all mapped variables.
8826 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8827 if (!Info)
8828 return Info.takeError();
8829
8830 // Call the runtime API __tgt_mapper_num_components to get the number of
8831 // pre-existing components.
8832 Value *OffloadingArgs[] = {MapperHandle};
8833 Value *PreviousSize = Builder.CreateCall(
8834 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8835 OffloadingArgs);
8836 Value *ShiftedPreviousSize =
8837 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8838
8839 // Fill up the runtime mapper handle for all components.
8840 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8841 Value *CurBaseArg = Info->BasePointers[I];
8842 Value *CurBeginArg = Info->Pointers[I];
8843 Value *CurSizeArg = Info->Sizes[I];
8844 Value *CurNameArg = Info->Names.size()
8845 ? Info->Names[I]
8846 : Constant::getNullValue(Builder.getPtrTy());
8847
8848 // Extract the MEMBER_OF field from the map type.
8849 Value *OriMapType = Builder.getInt64(
8850 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8851 Info->Types[I]));
8852 Value *MemberMapType =
8853 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8854
8855 // Combine the map type inherited from user-defined mapper with that
8856 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8857 // bits of the \a MapType, which is the input argument of the mapper
8858 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8859 // bits of MemberMapType.
8860 // [OpenMP 5.0], 1.2.6. map-type decay.
8861 // | alloc | to | from | tofrom | release | delete
8862 // ----------------------------------------------------------
8863 // alloc | alloc | alloc | alloc | alloc | release | delete
8864 // to | alloc | to | alloc | to | release | delete
8865 // from | alloc | alloc | from | from | release | delete
8866 // tofrom | alloc | to | from | tofrom | release | delete
8867 Value *LeftToFrom = Builder.CreateAnd(
8868 MapType,
8869 Builder.getInt64(
8870 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8871 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8872 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8873 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8874 BasicBlock *AllocElseBB =
8875 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8876 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8877 BasicBlock *ToElseBB =
8878 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8879 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8880 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8881 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8882 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8883 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8884 emitBlock(AllocBB, MapperFn);
8885 Value *AllocMapType = Builder.CreateAnd(
8886 MemberMapType,
8887 Builder.getInt64(
8888 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8889 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8890 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8891 Builder.CreateBr(EndBB);
8892 emitBlock(AllocElseBB, MapperFn);
8893 Value *IsTo = Builder.CreateICmpEQ(
8894 LeftToFrom,
8895 Builder.getInt64(
8896 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8897 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8898 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8899 // In case of to, clear OMP_MAP_FROM.
8900 emitBlock(ToBB, MapperFn);
8901 Value *ToMapType = Builder.CreateAnd(
8902 MemberMapType,
8903 Builder.getInt64(
8904 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8905 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8906 Builder.CreateBr(EndBB);
8907 emitBlock(ToElseBB, MapperFn);
8908 Value *IsFrom = Builder.CreateICmpEQ(
8909 LeftToFrom,
8910 Builder.getInt64(
8911 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8912 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8913 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8914 // In case of from, clear OMP_MAP_TO.
8915 emitBlock(FromBB, MapperFn);
8916 Value *FromMapType = Builder.CreateAnd(
8917 MemberMapType,
8918 Builder.getInt64(
8919 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8920 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8921 // In case of tofrom, do nothing.
8922 emitBlock(EndBB, MapperFn);
8923 LastBB = EndBB;
8924 PHINode *CurMapType =
8925 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8926 CurMapType->addIncoming(AllocMapType, AllocBB);
8927 CurMapType->addIncoming(ToMapType, ToBB);
8928 CurMapType->addIncoming(FromMapType, FromBB);
8929 CurMapType->addIncoming(MemberMapType, ToElseBB);
8930
8931 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8932 CurSizeArg, CurMapType, CurNameArg};
8933
8934 auto ChildMapperFn = CustomMapperCB(I);
8935 if (!ChildMapperFn)
8936 return ChildMapperFn.takeError();
8937 if (*ChildMapperFn) {
8938 // Call the corresponding mapper function.
8939 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8940 } else {
8941 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8942 // data structure.
8943 Builder.CreateCall(
8944 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8945 OffloadingArgs);
8946 }
8947 }
8948
8949 // Update the pointer to point to the next element that needs to be mapped,
8950 // and check whether we have mapped all elements.
8951 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8952 "omp.arraymap.next");
8953 PtrPHI->addIncoming(PtrNext, LastBB);
8954 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8955 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8956 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8957
8958 emitBlock(ExitBB, MapperFn);
8959 // Emit array deletion if this is an array section and \p MapType indicates
8960 // that deletion is required.
8961 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8962 MapType, MapName, ElementSize, DoneBB,
8963 /*IsInit=*/false);
8964
8965 // Emit the function exit block.
8966 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8967
8968 Builder.CreateRetVoid();
8969 Builder.restoreIP(SavedIP);
8970 return MapperFn;
8971}
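//
// A concrete reading of the map-type decay table above (a sketch): a member
// declared `tofrom` in the mapper but reached from a construct whose map type
// is `to` decays to `to` (the ToBB arm clears OMP_MAP_FROM); reached from
// `alloc`, the AllocBB arm clears both OMP_MAP_TO and OMP_MAP_FROM.
//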
8972
8973Error OpenMPIRBuilder::emitOffloadingArrays(
8974 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8975 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8976 bool IsNonContiguous,
8977 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8978
8979 // Reset the array information.
8980 Info.clearArrayInfo();
8981 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8982
8983 if (Info.NumberOfPtrs == 0)
8984 return Error::success();
8985
8986 Builder.restoreIP(AllocaIP);
8987 // Detect whether any capture sizes require runtime evaluation; if none
8988 // do, a constant array can eventually be used.
8989 ArrayType *PointerArrayType =
8990 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8991
8992 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8993 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8994
8995 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8996 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8997 AllocaInst *MappersArray = Builder.CreateAlloca(
8998 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8999 Info.RTArgs.MappersArray = MappersArray;
9000
9001 // If we don't have any VLA types or other types that require runtime
9002 // evaluation, we can use a constant array for the map sizes, otherwise we
9003 // need to fill up the arrays as we do for the pointers.
9004 Type *Int64Ty = Builder.getInt64Ty();
9005 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9006 ConstantInt::get(Int64Ty, 0));
9007 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9008 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9009 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9010 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9011 if (IsNonContiguous &&
9012 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9013 CombinedInfo.Types[I] &
9014 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9015 ConstSizes[I] =
9016 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9017 else
9018 ConstSizes[I] = CI;
9019 continue;
9020 }
9021 }
9022 RuntimeSizes.set(I);
9023 }
9024
9025 if (RuntimeSizes.all()) {
9026 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9027 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9028 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9029 restoreIPandDebugLoc(Builder, CodeGenIP);
9030 } else {
9031 auto *SizesArrayInit = ConstantArray::get(
9032 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9033 std::string Name = createPlatformSpecificName({"offload_sizes"});
9034 auto *SizesArrayGbl =
9035 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9036 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9037 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9038
9039 if (!RuntimeSizes.any()) {
9040 Info.RTArgs.SizesArray = SizesArrayGbl;
9041 } else {
9042 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9043 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9044 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9045 AllocaInst *Buffer = Builder.CreateAlloca(
9046 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9047 Buffer->setAlignment(OffloadSizeAlign);
9048 restoreIPandDebugLoc(Builder, CodeGenIP);
9049 Builder.CreateMemCpy(
9050 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9051 SizesArrayGbl, OffloadSizeAlign,
9052 Builder.getIntN(
9053 IndexSize,
9054 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9055
9056 Info.RTArgs.SizesArray = Buffer;
9057 }
9058 restoreIPandDebugLoc(Builder, CodeGenIP);
9059 }
9060
9061 // The map types are always constant so we don't need to generate code to
9062 // fill arrays. Instead, we create an array constant.
9063 SmallVector<uint64_t, 4> Mapping;
9064 for (auto mapFlag : CombinedInfo.Types)
9065 Mapping.push_back(
9066 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9067 mapFlag));
9068 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9069 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9070 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9071
9072 // The information types are only built if provided.
9073 if (!CombinedInfo.Names.empty()) {
9074 auto *MapNamesArrayGbl = createOffloadMapnames(
9075 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9076 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9077 Info.EmitDebug = true;
9078 } else {
9079 Info.RTArgs.MapNamesArray =
9080 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9081 Info.EmitDebug = false;
9082 }
9083
9084 // If there's a present map type modifier, it must not be applied to the end
9085 // of a region, so generate a separate map type array in that case.
9086 if (Info.separateBeginEndCalls()) {
9087 bool EndMapTypesDiffer = false;
9088 for (uint64_t &Type : Mapping) {
9089 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9090 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9091 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9092 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9093 EndMapTypesDiffer = true;
9094 }
9095 }
9096 if (EndMapTypesDiffer) {
9097 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9098 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9099 }
9100 }
9101
9102 PointerType *PtrTy = Builder.getPtrTy();
9103 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9104 Value *BPVal = CombinedInfo.BasePointers[I];
9105 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9106 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9107 0, I);
9108 Builder.CreateAlignedStore(BPVal, BP,
9109 M.getDataLayout().getPrefTypeAlign(PtrTy));
9110
9111 if (Info.requiresDevicePointerInfo()) {
9112 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9113 CodeGenIP = Builder.saveIP();
9114 Builder.restoreIP(AllocaIP);
9115 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9116 Builder.restoreIP(CodeGenIP);
9117 if (DeviceAddrCB)
9118 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9119 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9120 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9121 if (DeviceAddrCB)
9122 DeviceAddrCB(I, BP);
9123 }
9124 }
9125
9126 Value *PVal = CombinedInfo.Pointers[I];
9127 Value *P = Builder.CreateConstInBoundsGEP2_32(
9128 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9129 I);
9130 // TODO: Check alignment correct.
9131 Builder.CreateAlignedStore(PVal, P,
9132 M.getDataLayout().getPrefTypeAlign(PtrTy));
9133
9134 if (RuntimeSizes.test(I)) {
9135 Value *S = Builder.CreateConstInBoundsGEP2_32(
9136 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9137 /*Idx0=*/0,
9138 /*Idx1=*/I);
9139 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9140 Int64Ty,
9141 /*isSigned=*/true),
9142 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9143 }
9144 // Fill up the mapper array.
9145 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9146 Value *MFunc = ConstantPointerNull::get(PtrTy);
9147
9148 auto CustomMFunc = CustomMapperCB(I);
9149 if (!CustomMFunc)
9150 return CustomMFunc.takeError();
9151 if (*CustomMFunc)
9152 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9153
9154 Value *MAddr = Builder.CreateInBoundsGEP(
9155 MappersArray->getAllocatedType(), MappersArray,
9156 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9157 Builder.CreateAlignedStore(
9158 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9159 }
9160
9161 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9162 Info.NumberOfPtrs == 0)
9163 return Error::success();
9164 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9165 return Error::success();
9166}
9167
9168void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9169 BasicBlock *CurBB = Builder.GetInsertBlock();
9170
9171 if (!CurBB || CurBB->getTerminator()) {
9172 // If there is no insert point or the previous block is already
9173 // terminated, don't touch it.
9174 } else {
9175 // Otherwise, create a fall-through branch.
9176 Builder.CreateBr(Target);
9177 }
9178
9179 Builder.ClearInsertionPoint();
9180}
9181
9182void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9183 bool IsFinished) {
9184 BasicBlock *CurBB = Builder.GetInsertBlock();
9185
9186 // Fall out of the current block (if necessary).
9187 emitBranch(BB);
9188
9189 if (IsFinished && BB->use_empty()) {
9190 BB->eraseFromParent();
9191 return;
9192 }
9193
9194 // Place the block after the current block, if possible, or else at
9195 // the end of the function.
9196 if (CurBB && CurBB->getParent())
9197 CurFn->insert(std::next(CurBB->getIterator()), BB);
9198 else
9199 CurFn->insert(CurFn->end(), BB);
9200 Builder.SetInsertPoint(BB);
9201}
9202
9203Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9204 BodyGenCallbackTy ElseGen,
9205 InsertPointTy AllocaIP) {
9206 // If the condition constant folds and can be elided, try to avoid emitting
9207 // the condition and the dead arm of the if/else.
9208 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9209 auto CondConstant = CI->getSExtValue();
9210 if (CondConstant)
9211 return ThenGen(AllocaIP, Builder.saveIP());
9212
9213 return ElseGen(AllocaIP, Builder.saveIP());
9214 }
9215
9216 Function *CurFn = Builder.GetInsertBlock()->getParent();
9217
9218 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9219 // emit the conditional branch.
9220 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9221 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9222 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9223 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9224 // Emit the 'then' code.
9225 emitBlock(ThenBlock, CurFn);
9226 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9227 return Err;
9228 emitBranch(ContBlock);
9229 // Emit the 'else' code if present.
9230 // There is no need to emit line number for unconditional branch.
9231 emitBlock(ElseBlock, CurFn);
9232 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9233 return Err;
9234 // There is no need to emit line number for unconditional branch.
9235 emitBranch(ContBlock);
9236 // Emit the continuation block for code after the if.
9237 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9238 return Error::success();
9239}
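//
// A minimal usage sketch (hypothetical callbacks), mirroring the call made
// from emitTargetCall above:
// ```
//   auto ThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
//     /* emit the 'then' arm */ return Error::success();
//   };
//   auto ElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
//     /* emit the 'else' arm */ return Error::success();
//   };
//   if (Error Err = OMPBuilder.emitIfClause(Cond, ThenGen, ElseGen, AllocaIP))
//     return Err;
// ```
//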
9240
9241bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9242 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9245 "Unexpected Atomic Ordering.");
9246
9247 bool Flush = false;
9249
9250 switch (AK) {
9251 case Read:
9254 FlushAO = AtomicOrdering::Acquire;
9255 Flush = true;
9256 }
9257 break;
9258 case Write:
9259 case Compare:
9260 case Update:
9263 FlushAO = AtomicOrdering::Release;
9264 Flush = true;
9265 }
9266 break;
9267 case Capture:
9268 switch (AO) {
9270 FlushAO = AtomicOrdering::Acquire;
9271 Flush = true;
9272 break;
9274 FlushAO = AtomicOrdering::Release;
9275 Flush = true;
9276 break;
9280 Flush = true;
9281 break;
9282 default:
9283 // do nothing - leave silently.
9284 break;
9285 }
9286 }
9287
9288 if (Flush) {
9289 // The Flush RT call does not yet take a memory ordering, so this resolves
9290 // which atomic ordering the flush would use in anticipation of that, but
9291 // still issues the plain flush call for now.
9292 // TODO: pass `FlushAO` after memory ordering support is added.
9293 (void)FlushAO;
9294 emitFlush(Loc);
9295 }
9296
9297 // For AO == AtomicOrdering::Monotonic and all other case combinations,
9298 // do nothing.
9299 return Flush;
9300}
9301
9302OpenMPIRBuilder::InsertPointTy
9303OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9304 AtomicOpValue &X, AtomicOpValue &V,
9305 AtomicOrdering AO, InsertPointTy AllocaIP) {
9306 if (!updateToLocation(Loc))
9307 return Loc.IP;
9308
9309 assert(X.Var->getType()->isPointerTy() &&
9310 "OMP Atomic expects a pointer to target memory");
9311 Type *XElemTy = X.ElemTy;
9312 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9313 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9314 "OMP atomic read expected a scalar type");
9315
9316 Value *XRead = nullptr;
9317
9318 if (XElemTy->isIntegerTy()) {
9319 LoadInst *XLD =
9320 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9321 XLD->setAtomic(AO);
9322 XRead = cast<Value>(XLD);
9323 } else if (XElemTy->isStructTy()) {
9324 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9325 // target does not support `atomicrmw` of the size of the struct
9326 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9327 OldVal->setAtomic(AO);
9328 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9329 unsigned LoadSize =
9330 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9331 OpenMPIRBuilder::AtomicInfo atomicInfo(
9332 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9333 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9334 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9335 XRead = AtomicLoadRes.first;
9336 OldVal->eraseFromParent();
9337 } else {
9338 // We need to perform the atomic op as an integer.
9339 IntegerType *IntCastTy =
9340 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9341 LoadInst *XLoad =
9342 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9343 XLoad->setAtomic(AO);
9344 if (XElemTy->isFloatingPointTy()) {
9345 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9346 } else {
9347 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9348 }
9349 }
9350 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9351 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9352 return Builder.saveIP();
9353}
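//
// For an integer \p X this boils down to (a sketch; the flush is requested
// only for acquire/acq_rel/seq_cst orderings):
// ```
//   %omp.atomic.read = load atomic i32, ptr %x acquire, align 4
//   call void @__kmpc_flush(ptr @ident)
//   store i32 %omp.atomic.read, ptr %v, align 4
// ```
//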
9354
9355OpenMPIRBuilder::InsertPointTy
9356OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9357 AtomicOpValue &X, Value *Expr,
9358 AtomicOrdering AO, InsertPointTy AllocaIP) {
9359 if (!updateToLocation(Loc))
9360 return Loc.IP;
9361
9362 assert(X.Var->getType()->isPointerTy() &&
9363 "OMP Atomic expects a pointer to target memory");
9364 Type *XElemTy = X.ElemTy;
9365 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9366 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9367 "OMP atomic write expected a scalar type");
9368
9369 if (XElemTy->isIntegerTy()) {
9370 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9371 XSt->setAtomic(AO);
9372 } else if (XElemTy->isStructTy()) {
9373 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9374 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9375 unsigned LoadSize =
9376 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9377 OpenMPIRBuilder::AtomicInfo atomicInfo(
9378 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9379 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9380 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9381 OldVal->eraseFromParent();
9382 } else {
9383 // We need to bitcast and perform the atomic op as integers.
9384 IntegerType *IntCastTy =
9385 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9386 Value *ExprCast =
9387 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9388 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9389 XSt->setAtomic(AO);
9390 }
9391
9392 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9393 return Builder.saveIP();
9394}
9395
9396OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9397 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9398 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9399 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9400 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9401 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9402 if (!updateToLocation(Loc))
9403 return Loc.IP;
9404
9405 LLVM_DEBUG({
9406 Type *XTy = X.Var->getType();
9407 assert(XTy->isPointerTy() &&
9408 "OMP Atomic expects a pointer to target memory");
9409 Type *XElemTy = X.ElemTy;
9410 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9411 XElemTy->isPointerTy()) &&
9412 "OMP atomic update expected a scalar type");
9413 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9414 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9415 "OpenMP atomic does not support LT or GT operations");
9416 });
9417
9418 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9419 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9420 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9421 if (!AtomicResult)
9422 return AtomicResult.takeError();
9423 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9424 return Builder.saveIP();
9425}
9426
9427// FIXME: Duplicating AtomicExpand
9428Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9429 AtomicRMWInst::BinOp RMWOp) {
9430 switch (RMWOp) {
9431 case AtomicRMWInst::Add:
9432 return Builder.CreateAdd(Src1, Src2);
9433 case AtomicRMWInst::Sub:
9434 return Builder.CreateSub(Src1, Src2);
9435 case AtomicRMWInst::And:
9436 return Builder.CreateAnd(Src1, Src2);
9437 case AtomicRMWInst::Nand:
9438 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9439 case AtomicRMWInst::Or:
9440 return Builder.CreateOr(Src1, Src2);
9441 case AtomicRMWInst::Xor:
9442 return Builder.CreateXor(Src1, Src2);
9443 case AtomicRMWInst::Xchg:
9444 case AtomicRMWInst::FAdd:
9445 case AtomicRMWInst::FSub:
9446 case AtomicRMWInst::BAD_BINOP:
9447 case AtomicRMWInst::Max:
9448 case AtomicRMWInst::Min:
9449 case AtomicRMWInst::UMax:
9450 case AtomicRMWInst::UMin:
9451 case AtomicRMWInst::FMax:
9452 case AtomicRMWInst::FMin:
9453 case AtomicRMWInst::FMaximum:
9454 case AtomicRMWInst::FMinimum:
9455 case AtomicRMWInst::UIncWrap:
9456 case AtomicRMWInst::UDecWrap:
9457 case AtomicRMWInst::USubCond:
9458 case AtomicRMWInst::USubSat:
9459 llvm_unreachable("Unsupported atomic update operation");
9460 }
9461 llvm_unreachable("Unsupported atomic update operation");
9462}
9463
9464Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9465 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9466 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9467 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9468 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9469 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9470 // or a complex datatype.
9471 bool emitRMWOp = false;
9472 switch (RMWOp) {
9473 case AtomicRMWInst::Add:
9474 case AtomicRMWInst::And:
9475 case AtomicRMWInst::Nand:
9476 case AtomicRMWInst::Or:
9477 case AtomicRMWInst::Xor:
9478 case AtomicRMWInst::Xchg:
9479 emitRMWOp = XElemTy;
9480 break;
9481 case AtomicRMWInst::Sub:
9482 emitRMWOp = (IsXBinopExpr && XElemTy);
9483 break;
9484 default:
9485 emitRMWOp = false;
9486 }
9487 emitRMWOp &= XElemTy->isIntegerTy();
9488
9489 std::pair<Value *, Value *> Res;
9490 if (emitRMWOp) {
9491 AtomicRMWInst *RMWInst =
9492 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9493 if (T.isAMDGPU()) {
9494 if (IsIgnoreDenormalMode)
9495 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9496 llvm::MDNode::get(Builder.getContext(), {}));
9497 if (!IsFineGrainedMemory)
9498 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9499 llvm::MDNode::get(Builder.getContext(), {}));
9500 if (!IsRemoteMemory)
9501 RMWInst->setMetadata("amdgpu.no.remote.memory",
9502 llvm::MDNode::get(Builder.getContext(), {}));
9503 }
9504 Res.first = RMWInst;
9505 // Not needed except in the case of postfix captures. Generated anyway for
9506 // consistency with the else part; any DCE pass will remove it.
9507 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9508 if (RMWOp == AtomicRMWInst::Xchg)
9509 Res.second = Res.first;
9510 else
9511 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9512 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9513 XElemTy->isStructTy()) {
9514 LoadInst *OldVal =
9515 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9516 OldVal->setAtomic(AO);
9517 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9518 unsigned LoadSize =
9519 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9520
9521 OpenMPIRBuilder::AtomicInfo atomicInfo(
9522 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9523 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9524 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9525 BasicBlock *CurBB = Builder.GetInsertBlock();
9526 Instruction *CurBBTI = CurBB->getTerminator();
9527 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9528 BasicBlock *ExitBB =
9529 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9530 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9531 X->getName() + ".atomic.cont");
9532 ContBB->getTerminator()->eraseFromParent();
9533 Builder.restoreIP(AllocaIP);
9534 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9535 NewAtomicAddr->setName(X->getName() + "x.new.val");
9536 Builder.SetInsertPoint(ContBB);
9537 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9538 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9539 Value *OldExprVal = PHI;
9540 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9541 if (!CBResult)
9542 return CBResult.takeError();
9543 Value *Upd = *CBResult;
9544 Builder.CreateStore(Upd, NewAtomicAddr);
9545 AtomicOrdering Failure =
9546 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9547 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9548 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9549 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9550 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9551 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9552 OldVal->eraseFromParent();
9553 Res.first = OldExprVal;
9554 Res.second = Upd;
9555
9556 if (UnreachableInst *ExitTI =
9557 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9558 CurBBTI->eraseFromParent();
9559 Builder.SetInsertPoint(ExitBB);
9560 } else {
9561 Builder.SetInsertPoint(ExitTI);
9562 }
9563 } else {
9564 IntegerType *IntCastTy =
9565 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9566 LoadInst *OldVal =
9567 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9568 OldVal->setAtomic(AO);
9569 // CurBB
9570 // | /---\
9571 // ContBB |
9572 // | \---/
9573 // ExitBB
9574 BasicBlock *CurBB = Builder.GetInsertBlock();
9575 Instruction *CurBBTI = CurBB->getTerminator();
9576 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9577 BasicBlock *ExitBB =
9578 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9579 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9580 X->getName() + ".atomic.cont");
9581 ContBB->getTerminator()->eraseFromParent();
9582 Builder.restoreIP(AllocaIP);
9583 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9584 NewAtomicAddr->setName(X->getName() + "x.new.val");
9585 Builder.SetInsertPoint(ContBB);
9586 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9587 PHI->addIncoming(OldVal, CurBB);
9588 bool IsIntTy = XElemTy->isIntegerTy();
9589 Value *OldExprVal = PHI;
9590 if (!IsIntTy) {
9591 if (XElemTy->isFloatingPointTy()) {
9592 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9593 X->getName() + ".atomic.fltCast");
9594 } else {
9595 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9596 X->getName() + ".atomic.ptrCast");
9597 }
9598 }
9599
9600 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9601 if (!CBResult)
9602 return CBResult.takeError();
9603 Value *Upd = *CBResult;
9604 Builder.CreateStore(Upd, NewAtomicAddr);
9605 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9606 AtomicOrdering Failure =
9607 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9608 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9609 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9610 Result->setVolatile(VolatileX);
9611 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9612 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9613 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9614 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9615
9616 Res.first = OldExprVal;
9617 Res.second = Upd;
9618
9619 // Set the insertion point in the exit block.
9620 if (UnreachableInst *ExitTI =
9621 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9622 CurBBTI->eraseFromParent();
9623 Builder.SetInsertPoint(ExitBB);
9624 } else {
9625 Builder.SetInsertPoint(ExitTI);
9626 }
9627 }
9628
9629 return Res;
9630}
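//
// When no single atomicrmw fits, the code above expands to the usual
// compare-exchange retry loop; schematically, for an i32-sized element
// (a sketch):
// ```
// atomic.cont:
//   %phi  = phi i32 [ %atomic.load, %entry ], [ %prev, %atomic.cont ]
//   ; run UpdateOp on %phi, store the result to %x.new.val, reload as i32
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %desired <AO> <failure-order>
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok   = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %atomic.exit, label %atomic.cont
// ```
//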
9631
9632OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9633 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9634 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9635 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9636 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9637 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9638 if (!updateToLocation(Loc))
9639 return Loc.IP;
9640
9641 LLVM_DEBUG({
9642 Type *XTy = X.Var->getType();
9643 assert(XTy->isPointerTy() &&
9644 "OMP Atomic expects a pointer to target memory");
9645 Type *XElemTy = X.ElemTy;
9646 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9647 XElemTy->isPointerTy()) &&
9648 "OMP atomic capture expected a scalar type");
9649 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9650 "OpenMP atomic does not support LT or GT operations");
9651 });
9652
9653 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9654 // 'x' is simply atomically rewritten with 'expr'.
9655 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9656 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9657 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9658 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9659 if (!AtomicResult)
9660 return AtomicResult.takeError();
9661 Value *CapturedVal =
9662 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9663 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9664
9665 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9666 return Builder.saveIP();
9667}
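//
// The first/second choice above encodes the two OpenMP capture forms
// (a sketch):
// ```
//   v = x; x = x op expr;   // postfix: stores AtomicResult->first (old x)
//   x = x op expr; v = x;   // prefix:  stores AtomicResult->second (new x)
// ```
//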
9668
9669OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9670 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9671 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9672 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9673 bool IsFailOnly) {
9674
9675 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9676 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9677 IsPostfixUpdate, IsFailOnly, Failure);
9678}
9679
9680OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9681 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9682 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9683 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9684 bool IsFailOnly, AtomicOrdering Failure) {
9685
9686 if (!updateToLocation(Loc))
9687 return Loc.IP;
9688
9689 assert(X.Var->getType()->isPointerTy() &&
9690 "OMP atomic expects a pointer to target memory");
9691 // compare capture
9692 if (V.Var) {
9693 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9694 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9695 }
9696
9697 bool IsInteger = E->getType()->isIntegerTy();
9698
9699 if (Op == OMPAtomicCompareOp::EQ) {
9700 AtomicCmpXchgInst *Result = nullptr;
9701 if (!IsInteger) {
9702 IntegerType *IntCastTy =
9703 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9704 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9705 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9706 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9707 AO, Failure);
9708 } else {
9709 Result =
9710 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9711 }
9712
9713 if (V.Var) {
9714 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9715 if (!IsInteger)
9716 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9717 assert(OldValue->getType() == V.ElemTy &&
9718 "OldValue and V must be of same type");
9719 if (IsPostfixUpdate) {
9720 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9721 } else {
9722 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9723 if (IsFailOnly) {
9724 // CurBB----
9725 // | |
9726 // v |
9727 // ContBB |
9728 // | |
9729 // v |
9730 // ExitBB <-
9731 //
9732 // where ContBB only contains the store of old value to 'v'.
9733 BasicBlock *CurBB = Builder.GetInsertBlock();
9734 Instruction *CurBBTI = CurBB->getTerminator();
9735 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9736 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9737 CurBBTI, X.Var->getName() + ".atomic.exit");
9738 BasicBlock *ContBB = CurBB->splitBasicBlock(
9739 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9740 ContBB->getTerminator()->eraseFromParent();
9741 CurBB->getTerminator()->eraseFromParent();
9742
9743 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9744
9745 Builder.SetInsertPoint(ContBB);
9746 Builder.CreateStore(OldValue, V.Var);
9747 Builder.CreateBr(ExitBB);
9748
9749 if (UnreachableInst *ExitTI =
9750 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9751 CurBBTI->eraseFromParent();
9752 Builder.SetInsertPoint(ExitBB);
9753 } else {
9754 Builder.SetInsertPoint(ExitTI);
9755 }
9756 } else {
9757 Value *CapturedValue =
9758 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9759 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9760 }
9761 }
9762 }
9763 // The comparison result has to be stored.
9764 if (R.Var) {
9765 assert(R.Var->getType()->isPointerTy() &&
9766 "r.var must be of pointer type");
9767 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9768
9769 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9770 Value *ResultCast = R.IsSigned
9771 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9772 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9773 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9774 }
9775 } else {
9776 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9777 "Op should be either max or min at this point");
9778 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9779
9780 // Reverse the comparison op, as the OpenMP forms differ from the LLVM
9781 // forms. Take max as an example:
9782 // OpenMP form:
9783 // x = x > expr ? expr : x;
9784 // LLVM form:
9785 // *ptr = *ptr > val ? *ptr : val;
9786 // We need to transform to LLVM form.
9787 // x = x <= expr ? x : expr;
9788 AtomicRMWInst::BinOp NewOp;
9789 if (IsXBinopExpr) {
9790 if (IsInteger) {
9791 if (X.IsSigned)
9792 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9793 : AtomicRMWInst::Max;
9794 else
9795 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9796 : AtomicRMWInst::UMax;
9797 } else {
9798 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9799 : AtomicRMWInst::FMax;
9800 }
9801 } else {
9802 if (IsInteger) {
9803 if (X.IsSigned)
9804 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9805 : AtomicRMWInst::Min;
9806 else
9807 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9808 : AtomicRMWInst::UMin;
9809 } else {
9810 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9811 : AtomicRMWInst::FMin;
9812 }
9813 }
9814
9815 AtomicRMWInst *OldValue =
9816 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9817 if (V.Var) {
9818 Value *CapturedValue = nullptr;
9819 if (IsPostfixUpdate) {
9820 CapturedValue = OldValue;
9821 } else {
9822 CmpInst::Predicate Pred;
9823 switch (NewOp) {
9824 case AtomicRMWInst::Max:
9825 Pred = CmpInst::ICMP_SGT;
9826 break;
9827 case AtomicRMWInst::UMax:
9828 Pred = CmpInst::ICMP_UGT;
9829 break;
9830 case AtomicRMWInst::FMax:
9831 Pred = CmpInst::FCMP_OGT;
9832 break;
9833 case AtomicRMWInst::Min:
9834 Pred = CmpInst::ICMP_SLT;
9835 break;
9836 case AtomicRMWInst::UMin:
9837 Pred = CmpInst::ICMP_ULT;
9838 break;
9839 case AtomicRMWInst::FMin:
9840 Pred = CmpInst::FCMP_OLT;
9841 break;
9842 default:
9843 llvm_unreachable("unexpected comparison op");
9844 }
9845 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9846 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9847 }
9848 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9849 }
9850 }
9851
9852 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9853
9854 return Builder.saveIP();
9855}
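//
// A worked instance of the reversal above (a sketch): for signed integers
// with IsXBinopExpr set,
// ```
//   x = x > e ? e : x;      // OpenMP '>' (MAX) form
// ```
// lowers to `atomicrmw min ptr %x, i32 %e`, since storing e whenever x > e
// is exactly a signed-min update.
//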
9856
9857OpenMPIRBuilder::InsertPointOrErrorTy
9858OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9859 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9860 Value *NumTeamsUpper, Value *ThreadLimit,
9861 Value *IfExpr) {
9862 if (!updateToLocation(Loc))
9863 return InsertPointTy();
9864
9865 uint32_t SrcLocStrSize;
9866 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9867 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9868 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9869
9870 // The outer allocation basic block is the entry block of the current function.
9871 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9872 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9873 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9874 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9875 }
9876
9877 // The current basic block is split into four basic blocks. After outlining,
9878 // they will be mapped as follows:
9879 // ```
9880 // def current_fn() {
9881 // current_basic_block:
9882 // br label %teams.exit
9883 // teams.exit:
9884 // ; instructions after teams
9885 // }
9886 //
9887 // def outlined_fn() {
9888 // teams.alloca:
9889 // br label %teams.body
9890 // teams.body:
9891 // ; instructions within teams body
9892 // }
9893 // ```
9894 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9895 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9896 BasicBlock *AllocaBB =
9897 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9898
9899 bool SubClausesPresent =
9900 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9901 // Push num_teams
9902 if (!Config.isTargetDevice() && SubClausesPresent) {
9903 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9904 "if lowerbound is non-null, then upperbound must also be non-null "
9905 "for bounds on num_teams");
9906
9907 if (NumTeamsUpper == nullptr)
9908 NumTeamsUpper = Builder.getInt32(0);
9909
9910 if (NumTeamsLower == nullptr)
9911 NumTeamsLower = NumTeamsUpper;
9912
9913 if (IfExpr) {
9914 assert(IfExpr->getType()->isIntegerTy() &&
9915 "argument to if clause must be an integer value");
9916
9917 // upper = ifexpr ? upper : 1
9918 if (IfExpr->getType() != Int1)
9919 IfExpr = Builder.CreateICmpNE(IfExpr,
9920 ConstantInt::get(IfExpr->getType(), 0));
9921 NumTeamsUpper = Builder.CreateSelect(
9922 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9923
9924 // lower = ifexpr ? lower : 1
9925 NumTeamsLower = Builder.CreateSelect(
9926 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9927 }
9928
9929 if (ThreadLimit == nullptr)
9930 ThreadLimit = Builder.getInt32(0);
9931
9932 Value *ThreadNum = getOrCreateThreadID(Ident);
9933 Builder.CreateCall(
9934 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9935 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9936 }
9937 // Generate the body of teams.
9938 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9939 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9940 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9941 return Err;
9942
9943 OutlineInfo OI;
9944 OI.EntryBB = AllocaBB;
9945 OI.ExitBB = ExitBB;
9946 OI.OuterAllocaBB = &OuterAllocaBB;
9947
9948 // Insert fake values for global tid and bound tid.
9949 SmallVector<Instruction *, 8> ToBeDeleted;
9950 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9951 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9952 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9953 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9954 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9955
9956 auto HostPostOutlineCB = [this, Ident,
9957 ToBeDeleted](Function &OutlinedFn) mutable {
9958 // The stale call instruction will be replaced with a new call instruction
9959 // to the runtime function, passing the outlined function as an argument.
9960
9961 assert(OutlinedFn.hasOneUse() &&
9962 "there must be a single user for the outlined function");
9963 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9964 ToBeDeleted.push_back(StaleCI);
9965
9966 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9967 "Outlined function must have two or three arguments only");
9968
9969 bool HasShared = OutlinedFn.arg_size() == 3;
9970
9971 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9972 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9973 if (HasShared)
9974 OutlinedFn.getArg(2)->setName("data");
9975
9976 // Call to the runtime function for teams in the current function.
9977 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9978 "outlined function.");
9979 Builder.SetInsertPoint(StaleCI);
9980 SmallVector<Value *> Args = {
9981 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9982 if (HasShared)
9983 Args.push_back(StaleCI->getArgOperand(2));
9984 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9985 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9986 Args);
9987
9988 for (Instruction *I : llvm::reverse(ToBeDeleted))
9989 I->eraseFromParent();
9990 };
9991
9992 if (!Config.isTargetDevice())
9993 OI.PostOutlineCB = HostPostOutlineCB;
9994
9995 addOutlineInfo(std::move(OI));
9996
9997 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9998
9999 return Builder.saveIP();
10000}
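// For illustration, a frontend might drive this entry point roughly as
// follows; `OMPBuilder`, the clause values, and the callback body are assumed
// placeholders rather than part of this file:
// ```
// auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                      OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
//   Builder.restoreIP(CodeGenIP);
//   // ... emit the teams region body ...
//   return Error::success();
// };
// // Roughly: #pragma omp teams num_teams(4:8) thread_limit(64)
// OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createTeams(
//     Builder.saveIP(), BodyGenCB, /*NumTeamsLower=*/Builder.getInt32(4),
//     /*NumTeamsUpper=*/Builder.getInt32(8),
//     /*ThreadLimit=*/Builder.getInt32(64), /*IfExpr=*/nullptr));
// Builder.restoreIP(AfterIP);
// ```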
10001
10002OpenMPIRBuilder::InsertPointOrErrorTy
10003OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10004 InsertPointTy OuterAllocaIP,
10005 BodyGenCallbackTy BodyGenCB) {
10006 if (!updateToLocation(Loc))
10007 return InsertPointTy();
10008
10009 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10010
10011 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10012 BasicBlock *BodyBB =
10013 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10014 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10015 }
10016 BasicBlock *ExitBB =
10017 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10018 BasicBlock *BodyBB =
10019 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10020 BasicBlock *AllocaBB =
10021 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10022
10023 // Generate the body of the distribute region.
10024 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10025 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10026 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10027 return Err;
10028
10029 OutlineInfo OI;
10030 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10031 OI.EntryBB = AllocaBB;
10032 OI.ExitBB = ExitBB;
10033
10034 addOutlineInfo(std::move(OI));
10035 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10036
10037 return Builder.saveIP();
10038}
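// As with createTeams above, the three splits leave the following shape
// behind; a rough sketch of the blocks recorded in the OutlineInfo (the
// region from distribute.alloca up to distribute.exit is outlined later):
// ```
// current_basic_block:
//   br label %distribute.alloca
// distribute.alloca:              ; OI.EntryBB
//   br label %distribute.body
// distribute.body:                ; populated by BodyGenCB
//   br label %distribute.exit
// distribute.exit:                ; OI.ExitBB, execution continues here
// ```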
10039
10040 GlobalVariable *
10041OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10042 std::string VarName) {
10043 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10044 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10045 Names.size()),
10046 Names);
10047 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10048 M, MapNamesArrayInit->getType(),
10049 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10050 VarName);
10051 return MapNamesArrayGlobal;
10052}
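// For illustration, with two map-name string constants and a VarName of
// ".offload_mapnames", the emitted private global looks roughly like the
// following (the operand names here are assumed, not produced by this
// function):
// ```
// @.offload_mapnames = private constant [2 x ptr]
//                      [ptr @.offload_mapnames.str.0,
//                       ptr @.offload_mapnames.str.1]
// ```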
10053
10054// Create all simple and struct types exposed by the runtime and remember
10055// the llvm::PointerTypes of them for easy access later.
10056void OpenMPIRBuilder::initializeTypes(Module &M) {
10057 LLVMContext &Ctx = M.getContext();
10058 StructType *T;
10059 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10060#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10061#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10062 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10063 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10064#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10065 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10066 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10067#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10068 T = StructType::getTypeByName(Ctx, StructName); \
10069 if (!T) \
10070 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10071 VarName = T; \
10072 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10073#include "llvm/Frontend/OpenMP/OMPKinds.def"
10074}
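// Each macro expands once per entry in OMPKinds.def. As a hand-expanded
// sketch, assuming an OMP_STRUCT_TYPE(Ident, "struct ident_t", false, ...)
// entry with an abbreviated field list, the expansion reads roughly:
// ```
// T = StructType::getTypeByName(Ctx, "struct ident_t");
// if (!T)
//   T = StructType::create(Ctx, {Int32, Int32, Int32, Int32, Int8Ptr},
//                          "struct ident_t", /*Packed=*/false);
// Ident = T;
// IdentPtr = PointerType::get(Ctx, DefaultTargetAS);
// ```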
10075
10076void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10077 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10078 SmallVectorImpl<BasicBlock *> &BlockVector) {
10079 SmallVector<BasicBlock *, 4> Worklist;
10080 BlockSet.insert(EntryBB);
10081 BlockSet.insert(ExitBB);
10082
10083 Worklist.push_back(EntryBB);
10084 while (!Worklist.empty()) {
10085 BasicBlock *BB = Worklist.pop_back_val();
10086 BlockVector.push_back(BB);
10087 for (BasicBlock *SuccBB : successors(BB))
10088 if (BlockSet.insert(SuccBB).second)
10089 Worklist.push_back(SuccBB);
10090 }
10091}
10092
10093void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10094 uint64_t Size, int32_t Flags,
10095 GlobalValue::LinkageTypes,
10096 StringRef Name) {
10097 if (!Config.isGPU()) {
10098 llvm::offloading::emitOffloadingEntry(
10099 M, object::OffloadKind::OFK_OpenMP, ID,
10100 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10101 return;
10102 }
10103 // TODO: Add support for global variables on the device after declare target
10104 // support.
10105 Function *Fn = dyn_cast<Function>(Addr);
10106 if (!Fn)
10107 return;
10108
10109 // Add a function attribute for the kernel.
10110 Fn->addFnAttr("kernel");
10111 if (T.isAMDGCN())
10112 Fn->addFnAttr("uniform-work-group-size", "true");
10113 Fn->addFnAttr(Attribute::MustProgress);
10114}
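// On a GPU target, the net effect on the kernel is roughly the following
// (sketch; the kernel name is elided and "uniform-work-group-size" is only
// added for AMDGCN):
// ```
// define void @__omp_offloading_...(...) #0 { ... }
// attributes #0 = { mustprogress "kernel" "uniform-work-group-size"="true" }
// ```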
10115
10116 // We only generate metadata for functions that contain target regions.
10117void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10118 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10119
10120 // If there are no entries, we don't need to do anything.
10121 if (OffloadInfoManager.empty())
10122 return;
10123
10124 LLVMContext &C = M.getContext();
10125 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10126 TargetRegionEntryInfo>,
10127 16>
10128 OrderedEntries(OffloadInfoManager.size());
10129
10130 // Auxiliary methods to create metadata values and strings.
10131 auto &&GetMDInt = [this](unsigned V) {
10132 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10133 };
10134
10135 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10136
10137 // Create the offloading info metadata node.
10138 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10139 auto &&TargetRegionMetadataEmitter =
10140 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10141 const TargetRegionEntryInfo &EntryInfo,
10142 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10143 // Generate metadata for target regions. Each entry of this metadata
10144 // contains:
10145 // - Entry 0 -> Kind of this type of metadata (0).
10146 // - Entry 1 -> Device ID of the file where the entry was identified.
10147 // - Entry 2 -> File ID of the file where the entry was identified.
10148 // - Entry 3 -> Mangled name of the function where the entry was
10149 // identified.
10150 // - Entry 4 -> Line in the file where the entry was identified.
10151 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10152 // - Entry 6 -> Order the entry was created.
10153 // The first element of the metadata node is the kind.
10154 Metadata *Ops[] = {
10155 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10156 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10157 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10158 GetMDInt(E.getOrder())};
10159
10160 // Save this entry in the right position of the ordered entries array.
10161 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10162
10163 // Add metadata to the named metadata node.
10164 MD->addOperand(MDNode::get(C, Ops));
10165 };
10166
10167 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10168
10169 // Create a function that emits metadata for each device global variable entry.
10170 auto &&DeviceGlobalVarMetadataEmitter =
10171 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10172 StringRef MangledName,
10173 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10174 // Generate metadata for global variables. Each entry of this metadata
10175 // contains:
10176 // - Entry 0 -> Kind of this type of metadata (1).
10177 // - Entry 1 -> Mangled name of the variable.
10178 // - Entry 2 -> Declare target kind.
10179 // - Entry 3 -> Order the entry was created.
10180 // The first element of the metadata node is the kind.
10181 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10182 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10183
10184 // Save this entry in the right position of the ordered entries array.
10185 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10186 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10187
10188 // Add metadata to the named metadata node.
10189 MD->addOperand(MDNode::get(C, Ops));
10190 };
10191
10192 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10193 DeviceGlobalVarMetadataEmitter);
10194
10195 for (const auto &E : OrderedEntries) {
10196 assert(E.first && "All ordered entries must exist!");
10197 if (const auto *CE =
10198 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10199 E.first)) {
10200 if (!CE->getID() || !CE->getAddress()) {
10201 // Do not blame the entry if the parent function is not emitted.
10202 TargetRegionEntryInfo EntryInfo = E.second;
10203 StringRef FnName = EntryInfo.ParentName;
10204 if (!M.getNamedValue(FnName))
10205 continue;
10206 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10207 continue;
10208 }
10209 createOffloadEntry(CE->getID(), CE->getAddress(),
10210 /*Size=*/0, CE->getFlags(),
10211 GlobalValue::WeakAnyLinkage);
10212 } else if (const auto *CE = dyn_cast<
10213 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10214 E.first)) {
10215 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10216 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10217 CE->getFlags());
10218 switch (Flags) {
10219 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10220 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10221 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10222 continue;
10223 if (!CE->getAddress()) {
10224 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10225 continue;
10226 }
10227 // The variable has no definition - no need to add the entry.
10228 if (CE->getVarSize() == 0)
10229 continue;
10230 break;
10231 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10232 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10233 (!Config.isTargetDevice() && CE->getAddress())) &&
10234 "Declare target link address is set.");
10235 if (Config.isTargetDevice())
10236 continue;
10237 if (!CE->getAddress()) {
10238 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10239 continue;
10240 }
10241 break;
10242 default:
10243 break;
10244 }
10245
10246 // Hidden or internal symbols on the device are not externally visible.
10247 // We should not attempt to register them by creating an offloading
10248 // entry. Indirect variables are handled separately on the device.
10249 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10250 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10251 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10252 continue;
10253
10254 // Indirect globals need to use a special name that doesn't match the name
10255 // of the associated host global.
10256 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10257 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10258 Flags, CE->getLinkage(), CE->getVarName());
10259 else
10260 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10261 Flags, CE->getLinkage());
10262
10263 } else {
10264 llvm_unreachable("Unsupported entry kind.");
10265 }
10266 }
10267
10268 // Emit requires directive globals to a special entry so the runtime can
10269 // register them when the device image is loaded.
10270 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10271 // entries should be redesigned to better suit this use-case.
10272 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10273 offloading::emitOffloadingEntry(
10274 M, object::OffloadKind::OFK_OpenMP,
10275 Constant::getNullValue(PointerType::getUnqual(M.getContext())),
10276 ".requires", /*Size=*/0,
10277 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10278 Config.getRequiresFlags());
10279}
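// Concretely, a module with a single target region carries metadata of the
// following shape (all operand values here are made-up examples):
// ```
// !omp_offload.info = !{!0}
// ; kind, device ID, file ID, parent name, line, count, order
// !0 = !{i32 0, i32 42, i32 215, !"_Z3foov", i32 17, i32 0, i32 0}
// ```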
10280
10281void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10282 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10283 unsigned FileID, unsigned Line, unsigned Count) {
10284 raw_svector_ostream OS(Name);
10285 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10286 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10287 if (Count)
10288 OS << "_" << Count;
10289}
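// For example, assuming KernelNamePrefix is "__omp_offloading_", a DeviceID
// of 0x1f, FileID of 0x2a, parent "foo", line 12, and count 0 produce:
// ```
// __omp_offloading_1f_2a_foo_l12    ; "_<count>" is appended only if non-zero
// ```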
10290
10291void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10292 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10293 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10294 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10295 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10296 EntryInfo.Line, NewCount);
10297}
10298
10299TargetRegionEntryInfo
10300OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10301 StringRef ParentName) {
10302 sys::fs::UniqueID ID(0xdeadf17e, 0);
10303 auto FileIDInfo = CallBack();
10304 uint64_t FileID = 0;
10305 std::error_code EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID);
10306 // If the inode ID could not be determined, create a hash value
10307 // of the current file name and use that as an ID.
10308 if (EC)
10309 FileID = hash_value(std::get<0>(FileIDInfo));
10310 else
10311 FileID = ID.getFile();
10312
10313 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10314 std::get<1>(FileIDInfo));
10315}
10316
10317unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10318 unsigned Offset = 0;
10319 for (uint64_t Remain =
10320 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10321 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10322 !(Remain & 1); Remain = Remain >> 1)
10323 Offset++;
10324 return Offset;
10325}
10326
10327 omp::OpenMPOffloadMappingFlags
10328OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10329 // Rotate by getFlagMemberOffset() bits.
10330 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10331 << getFlagMemberOffset());
10332}
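// Worked example, assuming OMP_MAP_MEMBER_OF occupies the high 16 bits of
// the 64-bit flags (mask 0xFFFF000000000000, so getFlagMemberOffset() == 48):
// ```
// getMemberOfFlag(0) == ((uint64_t)0 + 1) << 48 == 0x0001000000000000
// getMemberOfFlag(2) == ((uint64_t)2 + 1) << 48 == 0x0003000000000000
// ```
// That is, the MEMBER_OF field stores the 1-based position of the parent
// entry in the map list.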
10333
10334void OpenMPIRBuilder::setCorrectMemberOfFlag(
10335 omp::OpenMPOffloadMappingFlags &Flags,
10336 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10337 // If the entry is PTR_AND_OBJ but has not been marked with the special
10338 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10339 // marked as MEMBER_OF.
10340 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10341 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10342 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10343 (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10344 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10345 return;
10346
10347 // Reset the placeholder value to prepare the flag for the assignment of the
10348 // proper MEMBER_OF value.
10349 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10350 Flags |= MemberOfFlag;
10351}
10352
10353Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10354 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10355 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10356 bool IsDeclaration, bool IsExternallyVisible,
10357 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10358 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10359 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10360 std::function<Constant *()> GlobalInitializer,
10361 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10362 // TODO: convert this to utilise the IRBuilder Config rather than
10363 // a passed down argument.
10364 if (OpenMPSIMD)
10365 return nullptr;
10366
10367 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10368 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10369 CaptureClause ==
10370 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10371 Config.hasRequiresUnifiedSharedMemory())) {
10372 SmallString<64> PtrName;
10373 {
10374 raw_svector_ostream OS(PtrName);
10375 OS << MangledName;
10376 if (!IsExternallyVisible)
10377 OS << format("_%x", EntryInfo.FileID);
10378 OS << "_decl_tgt_ref_ptr";
10379 }
10380
10381 Value *Ptr = M.getNamedValue(PtrName);
10382
10383 if (!Ptr) {
10384 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10385 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10386
10387 auto *GV = cast<GlobalVariable>(Ptr);
10388 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10389
10390 if (!Config.isTargetDevice()) {
10391 if (GlobalInitializer)
10392 GV->setInitializer(GlobalInitializer());
10393 else
10394 GV->setInitializer(GlobalValue);
10395 }
10396
10397 registerTargetGlobalVariable(
10398 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10399 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10400 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10401 }
10402
10403 return cast<Constant>(Ptr);
10404 }
10405
10406 return nullptr;
10407}
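// For instance, a declare-target link variable with mangled name "x" yields
// a host-side indirection pointer roughly like the sketch below (an
// "_<FileID in hex>" infix is added when the symbol is not externally
// visible; names are assumed for illustration):
// ```
// @x_decl_tgt_ref_ptr = weak global ptr @x
// ```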
10408
10409void OpenMPIRBuilder::registerTargetGlobalVariable(
10410 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10411 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10412 bool IsDeclaration, bool IsExternallyVisible,
10413 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10414 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10415 std::vector<Triple> TargetTriple,
10416 std::function<Constant *()> GlobalInitializer,
10417 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10418 Constant *Addr) {
10419 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10420 (TargetTriple.empty() && !Config.isTargetDevice()))
10421 return;
10422
10423 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10424 StringRef VarName;
10425 int64_t VarSize;
10426 GlobalValue::LinkageTypes Linkage;
10427
10428 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10429 CaptureClause ==
10430 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10431 !Config.hasRequiresUnifiedSharedMemory()) {
10432 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10433 VarName = MangledName;
10434 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10435
10436 if (!IsDeclaration)
10437 VarSize = divideCeil(
10438 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10439 else
10440 VarSize = 0;
10441 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10442
10443 // This is a workaround carried over from Clang which prevents undesired
10444 // optimisation of internal variables.
10445 if (Config.isTargetDevice() &&
10446 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10447 // Do not create a "ref-variable" if the original is not also available
10448 // on the host.
10449 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10450 return;
10451
10452 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10453
10454 if (!M.getNamedValue(RefName)) {
10455 Constant *AddrRef =
10456 getOrCreateInternalVariable(Addr->getType(), RefName);
10457 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10458 GvAddrRef->setConstant(true);
10459 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10460 GvAddrRef->setInitializer(Addr);
10461 GeneratedRefs.push_back(GvAddrRef);
10462 }
10463 }
10464 } else {
10465 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10466 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10467 else
10468 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10469
10470 if (Config.isTargetDevice()) {
10471 VarName = (Addr) ? Addr->getName() : "";
10472 Addr = nullptr;
10473 } else {
10474 Addr = getAddrOfDeclareTargetVar(
10475 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10476 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10477 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10478 VarName = (Addr) ? Addr->getName() : "";
10479 }
10480 VarSize = M.getDataLayout().getPointerSize();
10481 Linkage = GlobalValue::WeakAnyLinkage;
10482 }
10483
10484 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10485 Flags, Linkage);
10486}
10487
10488/// Loads all the offload entries information from the host IR
10489/// metadata.
10490void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10491 // If we are in target mode, load the metadata from the host IR. This code has
10492 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10493
10494 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10495 if (!MD)
10496 return;
10497
10498 for (MDNode *MN : MD->operands()) {
10499 auto &&GetMDInt = [MN](unsigned Idx) {
10500 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10501 return cast<ConstantInt>(V->getValue())->getZExtValue();
10502 };
10503
10504 auto &&GetMDString = [MN](unsigned Idx) {
10505 auto *V = cast<MDString>(MN->getOperand(Idx));
10506 return V->getString();
10507 };
10508
10509 switch (GetMDInt(0)) {
10510 default:
10511 llvm_unreachable("Unexpected metadata!");
10512 break;
10513 case OffloadEntriesInfoManager::OffloadEntryInfo::
10514 OffloadingEntryInfoTargetRegion: {
10515 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10516 /*DeviceID=*/GetMDInt(1),
10517 /*FileID=*/GetMDInt(2),
10518 /*Line=*/GetMDInt(4),
10519 /*Count=*/GetMDInt(5));
10520 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10521 /*Order=*/GetMDInt(6));
10522 break;
10523 }
10524 case OffloadEntriesInfoManager::OffloadEntryInfo::
10525 OffloadingEntryInfoDeviceGlobalVar:
10526 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10527 /*MangledName=*/GetMDString(1),
10528 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10529 /*Flags=*/GetMDInt(2)),
10530 /*Order=*/GetMDInt(3));
10531 break;
10532 }
10533 }
10534}
10535
10536void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10537 StringRef HostFilePath) {
10538 if (HostFilePath.empty())
10539 return;
10540
10541 auto Buf = VFS.getBufferForFile(HostFilePath);
10542 if (std::error_code Err = Buf.getError()) {
10543 report_fatal_error(("error opening host file from host file path inside of "
10544 "OpenMPIRBuilder: " +
10545 Err.message())
10546 .c_str());
10547 }
10548
10549 LLVMContext Ctx;
10550 auto M = expectedToErrorOrAndEmitErrors(
10551 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10552 if (std::error_code Err = M.getError()) {
10553 report_fatal_error(
10554 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10555 .c_str());
10556 }
10557
10558 loadOffloadInfoMetadata(*M.get());
10559}
10560
10561//===----------------------------------------------------------------------===//
10562// OffloadEntriesInfoManager
10563//===----------------------------------------------------------------------===//
10564
10565bool OffloadEntriesInfoManager::empty() const {
10566 return OffloadEntriesTargetRegion.empty() &&
10567 OffloadEntriesDeviceGlobalVar.empty();
10568}
10569
10570unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10571 const TargetRegionEntryInfo &EntryInfo) const {
10572 auto It = OffloadEntriesTargetRegionCount.find(
10573 getTargetRegionEntryCountKey(EntryInfo));
10574 if (It == OffloadEntriesTargetRegionCount.end())
10575 return 0;
10576 return It->second;
10577}
10578
10579void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10580 const TargetRegionEntryInfo &EntryInfo) {
10581 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10582 EntryInfo.Count + 1;
10583}
10584
10585/// Initialize target region entry.
10586void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10587 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10588 OffloadEntriesTargetRegion[EntryInfo] =
10589 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10590 OMPTargetRegionEntryTargetRegion);
10591 ++OffloadingEntriesNum;
10592}
10593
10594void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10595 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10596 OMPTargetRegionEntryKind Flags) {
10597 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10598
10599 // Update the EntryInfo with the next available count for this location.
10600 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10601
10602 // If we are emitting code for a target device, the entry is already
10603 // initialized and only has to be registered.
10604 if (OMPBuilder->Config.isTargetDevice()) {
10605 // This could happen if the device compilation is invoked standalone.
10606 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10607 return;
10608 }
10609 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10610 Entry.setAddress(Addr);
10611 Entry.setID(ID);
10612 Entry.setFlags(Flags);
10613 } else {
10614 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10615 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10616 return;
10617 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10618 "Target region entry already registered!");
10619 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10620 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10621 ++OffloadingEntriesNum;
10622 }
10623 incrementTargetRegionEntryInfoCount(EntryInfo);
10624}
10625
10626bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10627 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10628
10629 // Update the EntryInfo with the next available count for this location.
10630 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10631
10632 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10633 if (It == OffloadEntriesTargetRegion.end()) {
10634 return false;
10635 }
10636 // Fail if this entry is already registered.
10637 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10638 return false;
10639 return true;
10640}
10641
10642void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10643 const OffloadTargetRegionEntryInfoActTy &Action) {
10644 // Scan all target region entries and perform the provided action.
10645 for (const auto &It : OffloadEntriesTargetRegion) {
10646 Action(It.first, It.second);
10647 }
10648}
10649
10650void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10651 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10652 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10653 ++OffloadingEntriesNum;
10654}
10655
10656void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10657 StringRef VarName, Constant *Addr, int64_t VarSize,
10658 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10659 if (OMPBuilder->Config.isTargetDevice()) {
10660 // This could happen if the device compilation is invoked standalone.
10661 if (!hasDeviceGlobalVarEntryInfo(VarName))
10662 return;
10663 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10664 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10665 if (Entry.getVarSize() == 0) {
10666 Entry.setVarSize(VarSize);
10667 Entry.setLinkage(Linkage);
10668 }
10669 return;
10670 }
10671 Entry.setVarSize(VarSize);
10672 Entry.setLinkage(Linkage);
10673 Entry.setAddress(Addr);
10674 } else {
10675 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10676 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10677 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10678 "Entry not initialized!");
10679 if (Entry.getVarSize() == 0) {
10680 Entry.setVarSize(VarSize);
10681 Entry.setLinkage(Linkage);
10682 }
10683 return;
10684 }
10685 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10686 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10687 Addr, VarSize, Flags, Linkage,
10688 VarName.str());
10689 else
10690 OffloadEntriesDeviceGlobalVar.try_emplace(
10691 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10692 ++OffloadingEntriesNum;
10693 }
10694}
10695
10696void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10697 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10698 // Scan all device global variable entries and perform the provided action.
10699 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10700 Action(E.getKey(), E.getValue());
10701}
10702
10703//===----------------------------------------------------------------------===//
10704// CanonicalLoopInfo
10705//===----------------------------------------------------------------------===//
10706
10707void CanonicalLoopInfo::collectControlBlocks(
10708 SmallVectorImpl<BasicBlock *> &BBs) {
10709 // We only count those BBs as control blocks for which we do not need to
10710 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10711 // flow. For consistency, this also means we do not add the Body block, which
10712 // is just the entry to the body code.
10713 BBs.reserve(BBs.size() + 6);
10714 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10715}
10716
10717BasicBlock *CanonicalLoopInfo::getPreheader() const {
10718 assert(isValid() && "Requires a valid canonical loop");
10719 for (BasicBlock *Pred : predecessors(Header)) {
10720 if (Pred != Latch)
10721 return Pred;
10722 }
10723 llvm_unreachable("Missing preheader");
10724}
10725
10726void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10727 assert(isValid() && "Requires a valid canonical loop");
10728
10729 Instruction *CmpI = &getCond()->front();
10730 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10731 CmpI->setOperand(1, TripCount);
10732
10733#ifndef NDEBUG
10734 assertOK();
10735#endif
10736}
10737
10738void CanonicalLoopInfo::mapIndVar(
10739 llvm::function_ref<Value *(Instruction *)> Updater) {
10740 assert(isValid() && "Requires a valid canonical loop");
10741
10742 Instruction *OldIV = getIndVar();
10743
10744 // Record all uses excluding those introduced by the updater. Uses by the
10745 // CanonicalLoopInfo itself to keep track of the number of iterations are
10746 // excluded.
10747 SmallVector<Use *> ReplacableUses;
10748 for (Use &U : OldIV->uses()) {
10749 auto *User = dyn_cast<Instruction>(U.getUser());
10750 if (!User)
10751 continue;
10752 if (User->getParent() == getCond())
10753 continue;
10754 if (User->getParent() == getLatch())
10755 continue;
10756 ReplacableUses.push_back(&U);
10757 }
10758
10759 // Run the updater that may introduce new uses
10760 Value *NewIV = Updater(OldIV);
10761
10762 // Replace the old uses with the value returned by the updater.
10763 for (Use *U : ReplacableUses)
10764 U->set(NewIV);
10765
10766#ifndef NDEBUG
10767 assertOK();
10768#endif
10769}
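// A typical updater rescales the zero-based canonical IV into a user-visible
// loop variable. A minimal sketch, assuming Start and Step are loop-invariant
// Values computed elsewhere:
// ```
// CLI->mapIndVar([&](Instruction *IndVar) -> Value * {
//   Builder.SetInsertPoint(CLI->getBody(),
//                          CLI->getBody()->getFirstNonPHIIt());
//   Value *Scaled = Builder.CreateMul(IndVar, Step, "iv.scaled");
//   return Builder.CreateAdd(Scaled, Start, "iv.user");
// });
// ```
// The multiply itself uses IndVar, but because that use is created after the
// replaceable uses were recorded, it is not rewritten to the new value.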
10770
10771void CanonicalLoopInfo::assertOK() const {
10772#ifndef NDEBUG
10773 // No constraints if this object currently does not describe a loop.
10774 if (!isValid())
10775 return;
10776
10777 BasicBlock *Preheader = getPreheader();
10778 BasicBlock *Body = getBody();
10779 BasicBlock *After = getAfter();
10780
10781 // Verify standard control-flow we use for OpenMP loops.
10782 assert(Preheader);
10783 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10784 "Preheader must terminate with unconditional branch");
10785 assert(Preheader->getSingleSuccessor() == Header &&
10786 "Preheader must jump to header");
10787
10788 assert(Header);
10789 assert(isa<BranchInst>(Header->getTerminator()) &&
10790 "Header must terminate with unconditional branch");
10791 assert(Header->getSingleSuccessor() == Cond &&
10792 "Header must jump to exiting block");
10793
10794 assert(Cond);
10795 assert(Cond->getSinglePredecessor() == Header &&
10796 "Exiting block only reachable from header");
10797
10798 assert(isa<BranchInst>(Cond->getTerminator()) &&
10799 "Exiting block must terminate with conditional branch");
10800 assert(size(successors(Cond)) == 2 &&
10801 "Exiting block must have two successors");
10802 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10803 "Exiting block's first successor must jump to the body");
10804 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10805 "Exiting block's second successor must exit the loop");
10806
10807 assert(Body);
10808 assert(Body->getSinglePredecessor() == Cond &&
10809 "Body only reachable from exiting block");
10810 assert(!isa<PHINode>(Body->front()));
10811
10812 assert(Latch);
10813 assert(isa<BranchInst>(Latch->getTerminator()) &&
10814 "Latch must terminate with unconditional branch");
10815 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10816 // TODO: To support simple redirecting of the end of the body code that has
10817 // multiple predecessors, introduce another auxiliary basic block like preheader and after.
10818 assert(Latch->getSinglePredecessor() != nullptr);
10819 assert(!isa<PHINode>(Latch->front()));
10820
10821 assert(Exit);
10822 assert(isa<BranchInst>(Exit->getTerminator()) &&
10823 "Exit block must terminate with unconditional branch");
10824 assert(Exit->getSingleSuccessor() == After &&
10825 "Exit block must jump to after block");
10826
10827 assert(After);
10828 assert(After->getSinglePredecessor() == Exit &&
10829 "After block only reachable from exit block");
10830 assert(After->empty() || !isa<PHINode>(After->front()));
10831
10832 Instruction *IndVar = getIndVar();
10833 assert(IndVar && "Canonical induction variable not found?");
10834 assert(isa<IntegerType>(IndVar->getType()) &&
10835 "Induction variable must be an integer");
10836 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10837 "Induction variable must be a PHI in the loop header");
10838 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10839 assert(
10840 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10841 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10842
10843 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10844 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10845 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10846 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10847 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10848 ->isOne());
10849
10850 Value *TripCount = getTripCount();
10851 assert(TripCount && "Loop trip count not found?");
10852 assert(IndVar->getType() == TripCount->getType() &&
10853 "Trip count and induction variable must have the same type");
10854
10855 auto *CmpI = cast<CmpInst>(&Cond->front());
10856 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10857 "Exit condition must be an unsigned less-than comparison");
10858 assert(CmpI->getOperand(0) == IndVar &&
10859 "Exit condition must compare the induction variable");
10860 assert(CmpI->getOperand(1) == TripCount &&
10861 "Exit condition must compare with the trip count");
10862#endif
10863}
10864
10865void CanonicalLoopInfo::invalidate() {
10866 Header = nullptr;
10867 Cond = nullptr;
10868 Latch = nullptr;
10869 Exit = nullptr;
10870}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn)
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:535
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:466
Record of a variable value-assignment, aka a non-instruction representation of the dbg.value intrinsic.
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
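A sketch of the Error/Expected protocol these entries describe; parseWidth and useWidth are hypothetical names, and the parsing helper is the to_integer listed further below:

#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Error.h"
using namespace llvm;

static Expected<int> parseWidth(StringRef S) {
  int N;
  if (!to_integer(S, N, 10))
    return createStringError(inconvertibleErrorCode(),
                             "not an integer: " + S.str());
  return N;
}

static Error useWidth(StringRef S) {
  Expected<int> W = parseWidth(S);
  if (!W)
    return W.takeError(); // take ownership of the failure
  // ... use *W ...
  return Error::success();
}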
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
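A sketch tying the Function and FunctionType factories together; the function name and signature are assumptions for illustration:

#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Declare "void foo(i32)" in M.
static Function *declareFoo(Module &M) {
  LLVMContext &Ctx = M.getContext();
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                        {Type::getInt32Ty(Ctx)},
                                        /*isVarArg=*/false);
  return Function::Create(FTy, GlobalValue::ExternalLinkage,
                          M.getDataLayout().getProgramAddressSpace(),
                          "foo", &M);
}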
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the list of attributes for the given argument.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if InitVal==NULL.
Definition Globals.cpp:523
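A sketch of creating a global and setting its initializer with the accessors above (the variable name is illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// An internal i32 global, zero-initialized via setInitializer.
static GlobalVariable *makeCounter(Module &M) {
  Type *I32 = Type::getInt32Ty(M.getContext());
  auto *GV = new GlobalVariable(M, I32, /*isConstant=*/false,
                                GlobalValue::InternalLinkage,
                                /*Initializer=*/nullptr, "counter");
  GV->setInitializer(Constant::getNullValue(I32));
  return GV;
}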
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition IRBuilder.h:2780
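A sketch of the save/restore pattern the InsertPoint entries support (saveIP/restoreIP are IRBuilderBase accessors; block names are illustrative):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emit into Other without losing the builder's current position.
static void emitElsewhere(IRBuilder<> &B, BasicBlock *Other) {
  IRBuilderBase::InsertPoint IP = B.saveIP();
  B.SetInsertPoint(Other, Other->getFirstInsertionPt());
  // ... create instructions in Other ...
  B.restoreIP(IP);
}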
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the debug location of the next non-debug node.
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a module.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original order of instructions.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
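A sketch of the LoadInst mutators above; the pointer operand and i32 element type are assumptions:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

// Emit an aligned acquire load of an i32 through Ptr.
static Value *emitAcquireLoad(IRBuilder<> &B, Value *Ptr) {
  LoadInst *LD = B.CreateLoad(B.getInt32Ty(), Ptr, "val");
  LD->setAlignment(Align(4));
  LD->setAtomic(AtomicOrdering::Acquire); // default SyncScope::System
  return LD;
}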
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1573
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1443
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1565
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:607
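A sketch combining the metadata factories above; the kind name "my.kind" is made up for illustration:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Attach !{!"example", i32 1} to I under a custom kind.
static void tagInstruction(Instruction *I) {
  LLVMContext &Ctx = I->getContext();
  Metadata *Ops[] = {
      MDString::get(Ctx, "example"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
  I->setMetadata(Ctx.getMDKindID("my.kind"), MDTuple::get(Ctx, Ops));
}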
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple, which describes the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1753
iterator_range< op_iterator > operands()
Definition Metadata.h:1849
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:247
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition StringMap.h:255
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:233
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:461
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:281
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:626
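A tiny usage sketch of ends_with and drop_back (the suffix is an arbitrary example):

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// "module.ll" -> "module"; other names pass through unchanged.
static StringRef stripLLSuffix(StringRef Name) {
  if (Name.ends_with(".ll"))
    Name = Name.drop_back(3);
  return Name;
}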
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
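A sketch of the StructType factories above; the struct name and field types are assumptions:

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// An identified struct "pair" with body { i32, ptr }.
static StructType *makePair(LLVMContext &Ctx) {
  StructType *ST = StructType::create(Ctx, "pair");
  ST->setBody({Type::getInt32Ty(Ctx), PointerType::getUnqual(Ctx)});
  return ST; // ST->getElementType(0) is the i32
}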
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1040
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1102
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:411
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1112
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:133
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:149
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldReplace returns "true" for the given Use.
Definition Value.cpp:554
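A sketch of replaceUsesWithIf restricted to one function (helper name is hypothetical):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Retarget only the uses of Old that live inside F.
static void replaceUsesInFunction(Value *Old, Value *New, Function *F) {
  Old->replaceUsesWithIf(New, [F](Use &U) {
    auto *I = dyn_cast<Instruction>(U.getUser());
    return I && I->getFunction() == F;
  });
}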
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can have multiple uses of this value).
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:359
A raw_ostream that writes to a SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:85
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped.
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
LLVM_ABI std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:311
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:824
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1666
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:834
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular function.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the 0-based index and the remaining elements are the corresponding values from each input range.
Definition STLExtras.h:2461
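A sketch of enumerate over two equal-length ranges (container contents are illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Walk A and B in lockstep with a 0-based index.
static void dumpPairs(ArrayRef<int> A, ArrayRef<int> B) {
  for (auto [Idx, X, Y] : enumerate(A, B))
    errs() << Idx << ": " << X << " " << Y << "\n";
}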
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:294
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2125
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:627
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, or -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:401
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
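A sketch of the diamond produced by SplitBlockAndInsertIfThenElse (variable names are illustrative):

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

// Split before Pos on Cond; ThenTerm/ElseTerm end the two new blocks.
static void emitDiamond(Value *Cond, Instruction *Pos) {
  Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  SplitBlockAndInsertIfThenElse(Cond, Pos->getIterator(),
                                &ThenTerm, &ElseTerm);
  IRBuilder<> ThenB(ThenTerm); // emit the 'then' side here
  IRBuilder<> ElseB(ElseTerm); // emit the 'else' side here
  (void)ThenB;
  (void)ElseB;
}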
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1728
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:126
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user-specified parameters.
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0, the radix is auto-detected.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop body).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.