//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before which something will
/// be inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
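
// For example (sketch): 'schedule(dynamic, 4)' without an ordered clause maps
// to BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic, whereas
// plain 'schedule(static)' keeps the runtime's default monotonic behavior, so
// no monotonicity modifier is added.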

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale to move those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
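
// Usage sketch: splitting at the end of a block named "omp.par.region" with
// suffix ".split" produces a fall-through successor named
// "omp.par.region.split" while the insertion point stays in the old block.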

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {

/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
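
// E.g., a translation unit with 'requires reverse_offload
// unified_shared_memory' ends up with RequiresFlags ==
// (OMP_REQ_REVERSE_OFFLOAD | OMP_REQ_UNIFIED_SHARED_MEMORY), i.e.
// 0x002 | 0x008 == 0x00A.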

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
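
// Layout sketch: the resulting vector mirrors the runtime's kernel-argument
// structure; NumTeams/NumThreads are widened into 3-element {x, y, z} arrays,
// with any dimension that was not provided left at zero.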

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)               \
  case Enum:                                                                  \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                          \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                        \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)               \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                        \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                         \
  case Enum:                                                                  \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},       \
                             IsVarArg);                                       \
    Fn = M.getFunction(Str);                                                  \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
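
// Usage sketch: getOrCreateRuntimeFunction(M, OMPRTL___kmpc_barrier) yields a
// FunctionCallee for 'void @__kmpc_barrier(ptr, i32)', creating the external
// declaration and attaching its known attributes on first use.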

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}
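
// E.g., an 'alloca i32, i64 4' that was left in a later block is hoisted to
// the function entry here, while an alloca whose size is only known at
// runtime is left where it is.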

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; this may happen with
    // nested function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions which are
    // used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock */ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target, or we risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so that the end product is not implicitly adversely affected
  // by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // by the time they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it would yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}
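
// The emitted global looks roughly like:
//   @llvm.compiler.used = appending global [1 x ptr]
//       [ptr @__openmp_nvptx_data_transfer_temporary_storage],
//       section "llvm.metadata"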

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
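
// E.g., function "foo" in file "bar.c" at line 3, column 7 produces the
// location string ";bar.c;foo;3;7;;".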

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  // __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
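
// Emitted IR for the simple (non-cancellable) case is roughly:
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident, i32 %gtid)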

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep it alive, and could therefore inline the
  // host function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads, so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
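
// Control-flow sketch: a non-zero return value from __tgt_target_kernel
// branches to omp_offload.failed, which runs the host fallback before both
// paths rejoin at omp_offload.cont.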

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we have moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
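
// Resulting CFG sketch around the check:
//   %cmp = icmp eq i32 %cancel.flag, 0
//   br i1 %cmp, label %bb.cont, label %bb.cncl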

// Callback used to create OpenMP runtime calls to support the omp parallel
// clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
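
// Emitted call sketch (two captured pointers, no if-clause):
//   call void @__kmpc_parallel_51(ptr @ident, i32 %gtid, i32 1, i32 -1,
//       i32 -1, ptr @foo..omp_par, ptr null, ptr %args, i64 2)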

// Callback used to create OpenMP runtime calls to support the omp parallel
// clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
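
// Emitted call sketch (two captured pointers, no if-clause):
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @ident, i32 2, ptr @foo..omp_par, ptr %a, ptr %b)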

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the struct
  // for aggregate params in the device default alloca address space. The
  // OpenMP runtime requires that the params of the extracted functions are
  // passed as zero address space pointers. This flag ensures that extracted
  // function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

1533 BasicBlock *InsertBB = Builder.GetInsertBlock();
1534 Function *OuterFn = InsertBB->getParent();
1535
1536 // Save the outer alloca block because the insertion iterator may get
1537 // invalidated and we still need this later.
1538 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1539
1540 // Vector to remember instructions we used only during the modeling but which
1541 // we want to delete at the end.
1542 SmallVector<Instruction *, 4> ToBeDeleted;
1543
1544 // Change the location to the outer alloca insertion point to create and
1545 // initialize the allocas we pass into the parallel region.
1546 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1547 Builder.restoreIP(NewOuter);
1548 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1549 AllocaInst *ZeroAddrAlloca =
1550 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1551 Instruction *TIDAddr = TIDAddrAlloca;
1552 Instruction *ZeroAddr = ZeroAddrAlloca;
1553 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1554 // Add additional casts to enforce pointers in zero address space
1555 TIDAddr = new AddrSpaceCastInst(
1556 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1557 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1558 ToBeDeleted.push_back(TIDAddr);
1559 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1560 PointerType::get(M.getContext(), 0),
1561 "zero.addr.ascast");
1562 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1563 ToBeDeleted.push_back(ZeroAddr);
1564 }
1565
1566 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1567 // associated arguments in the outlined function, so we delete them later.
1568 ToBeDeleted.push_back(TIDAddrAlloca);
1569 ToBeDeleted.push_back(ZeroAddrAlloca);
1570
1571 // Create an artificial insertion point that will also ensure the blocks we
1572 // are about to split are not degenerate.
1573 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1574
1575 BasicBlock *EntryBB = UI->getParent();
1576 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1577 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1578 BasicBlock *PRegPreFiniBB =
1579 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1580 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1581
1582 auto FiniCBWrapper = [&](InsertPointTy IP) {
1583 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1584 // target to the region exit block.
1585 if (IP.getBlock()->end() == IP.getPoint()) {
1586 IRBuilder<>::InsertPointGuard IPG(Builder);
1587 Builder.restoreIP(IP);
1588 Instruction *I = Builder.CreateBr(PRegExitBB);
1589 IP = InsertPointTy(I->getParent(), I->getIterator());
1590 }
1591 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1592 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1593 "Unexpected insertion point for finalization call!");
1594 return FiniCB(IP);
1595 };
1596
1597 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1598
1599 // Generate the privatization allocas in the block that will become the entry
1600 // of the outlined function.
1601 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1602 InsertPointTy InnerAllocaIP = Builder.saveIP();
1603
1604 AllocaInst *PrivTIDAddr =
1605 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1606 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1607
1608 // Add some fake uses for OpenMP provided arguments.
1609 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1610 Instruction *ZeroAddrUse =
1611 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1612 ToBeDeleted.push_back(ZeroAddrUse);
1613
1614 // EntryBB
1615 // |
1616 // V
1617 // PRegEntryBB <- Privatization allocas are placed here.
1618 // |
1619 // V
1620 // PRegBodyBB <- BodyGen is invoked here.
1621 // |
1622 // V
1623 // PRegPreFiniBB <- The block we will start finalization from.
1624 // |
1625 // V
1626 // PRegExitBB <- A common exit to simplify block collection.
1627 //
1628
1629 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1630
1631 // Let the caller create the body.
1632 assert(BodyGenCB && "Expected body generation callback!");
1633 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1634 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1635 return Err;
1636
1637 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1638
1639 OutlineInfo OI;
1640 if (Config.isTargetDevice()) {
1641 // Generate OpenMP target specific runtime call
1642 OI.PostOutlineCB = [=, ToBeDeletedVec =
1643 std::move(ToBeDeleted)](Function &OutlinedFn) {
1644 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1645 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1646 ThreadID, ToBeDeletedVec);
1647 };
1648 } else {
1649 // Generate OpenMP host runtime call
1650 OI.PostOutlineCB = [=, ToBeDeletedVec =
1651 std::move(ToBeDeleted)](Function &OutlinedFn) {
1652 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1653 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1654 };
1655 }
1656
1657 OI.OuterAllocaBB = OuterAllocaBlock;
1658 OI.EntryBB = PRegEntryBB;
1659 OI.ExitBB = PRegExitBB;
1660
1661 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1662 SmallVector<BasicBlock *, 32> Blocks;
1663 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1664
1665 CodeExtractorAnalysisCache CEAC(*OuterFn);
1666 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1667 /* AggregateArgs */ false,
1668 /* BlockFrequencyInfo */ nullptr,
1669 /* BranchProbabilityInfo */ nullptr,
1670 /* AssumptionCache */ nullptr,
1671 /* AllowVarArgs */ true,
1672 /* AllowAlloca */ true,
1673 /* AllocationBlock */ OuterAllocaBlock,
1674 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1675
1676 // Find inputs to, outputs from the code region.
1677 BasicBlock *CommonExit = nullptr;
1678 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1679 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1680
1681 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1682 /*CollectGlobalInputs=*/true);
1683
1684 Inputs.remove_if([&](Value *I) {
1685 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1686 return GV->getValueType() == OpenMPIRBuilder::Ident;
1687
1688 return false;
1689 });
1690
1691 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1692
1693 FunctionCallee TIDRTLFn =
1694 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1695
1696 auto PrivHelper = [&](Value &V) -> Error {
1697 if (&V == TIDAddr || &V == ZeroAddr) {
1698 OI.ExcludeArgsFromAggregate.push_back(&V);
1699 return Error::success();
1700 }
1701
1702 SetVector<Use *> Uses;
1703 for (Use &U : V.uses())
1704 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1705 if (ParallelRegionBlockSet.count(UserI->getParent()))
1706 Uses.insert(&U);
1707
1708 // __kmpc_fork_call expects extra arguments as pointers. If the input
1709 // already has a pointer type, everything is fine. Otherwise, store the
1710 // value onto the stack and load it back inside the to-be-outlined region.
1711 // This ensures that only the pointer is passed to the function.
1712 // FIXME: if there are more than 15 trailing arguments, they must be
1713 // additionally packed in a struct.
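// For example, capturing an i32 %x yields (sketch):
//   %x.reloaded = alloca i32        ; at OuterAllocaIP
//   store i32 %x, ptr %x.reloaded   ; before branching into the region
//   %0 = load i32, ptr %x.reloaded  ; at InnerAllocaIP, used instead of %x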
1714 Value *Inner = &V;
1715 if (!V.getType()->isPointerTy()) {
1716 IRBuilder<>::InsertPointGuard Guard(Builder);
1717 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1718
1719 Builder.restoreIP(OuterAllocaIP);
1720 Value *Ptr =
1721 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1722
1723 // Store to stack at end of the block that currently branches to the entry
1724 // block of the to-be-outlined region.
1725 Builder.SetInsertPoint(InsertBB,
1726 InsertBB->getTerminator()->getIterator());
1727 Builder.CreateStore(&V, Ptr);
1728
1729 // Load back next to allocations in the to-be-outlined region.
1730 Builder.restoreIP(InnerAllocaIP);
1731 Inner = Builder.CreateLoad(V.getType(), Ptr);
1732 }
1733
1734 Value *ReplacementValue = nullptr;
1735 CallInst *CI = dyn_cast<CallInst>(&V);
1736 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1737 ReplacementValue = PrivTID;
1738 } else {
1739 InsertPointOrErrorTy AfterIP =
1740 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1741 if (!AfterIP)
1742 return AfterIP.takeError();
1743 Builder.restoreIP(*AfterIP);
1744 InnerAllocaIP = {
1745 InnerAllocaIP.getBlock(),
1746 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1747
1748 assert(ReplacementValue &&
1749 "Expected copy/create callback to set replacement value!");
1750 if (ReplacementValue == &V)
1751 return Error::success();
1752 }
1753
1754 for (Use *UPtr : Uses)
1755 UPtr->set(ReplacementValue);
1756
1757 return Error::success();
1758 };
1759
1760 // Reset the inner alloca insertion point, as it will be used for loading
1761 // the values wrapped into pointers before passing them into the
1762 // to-be-outlined region. Configure it to insert immediately after the fake
1763 // use of the zero address so that the loaded values are available in the
1764 // generated body and so that the OpenMP-related values (thread ID and zero
1765 // address pointers) remain leading in the argument list.
1766 InnerAllocaIP = IRBuilder<>::InsertPoint(
1767 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1768
1769 // Reset the outer alloca insertion point to the entry of the relevant block
1770 // in case it was invalidated.
1771 OuterAllocaIP = IRBuilder<>::InsertPoint(
1772 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1773
1774 for (Value *Input : Inputs) {
1775 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1776 if (Error Err = PrivHelper(*Input))
1777 return Err;
1778 }
1779 LLVM_DEBUG({
1780 for (Value *Output : Outputs)
1781 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1782 });
1783 assert(Outputs.empty() &&
1784 "OpenMP outlining should not produce live-out values!");
1785
1786 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1787 LLVM_DEBUG({
1788 for (auto *BB : Blocks)
1789 dbgs() << " PBR: " << BB->getName() << "\n";
1790 });
1791
1792 // Adjust the finalization stack, verify the adjustment, and call the
1793 // finalize function one last time to finalize values between the pre-fini
1794 // block and the exit block if we left the parallel region "the normal way".
1795 auto FiniInfo = FinalizationStack.pop_back_val();
1796 (void)FiniInfo;
1797 assert(FiniInfo.DK == OMPD_parallel &&
1798 "Unexpected finalization stack state!");
1799
1800 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1801
1802 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1803 if (Error Err = FiniCB(PreFiniIP))
1804 return Err;
1805
1806 // Register the outlined info.
1807 addOutlineInfo(std::move(OI));
1808
1809 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1810 UI->eraseFromParent();
1811
1812 return AfterIP;
1813}
1814
1815void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1816 // Build call void __kmpc_flush(ident_t *loc)
1817 uint32_t SrcLocStrSize;
1818 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1819 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1820
1821 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1822}
1823
1824void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1825 if (!updateToLocation(Loc))
1826 return;
1827 emitFlush(Loc);
1828}
1829
1830void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1831 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1832 // global_tid);
1833 uint32_t SrcLocStrSize;
1834 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1835 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1836 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1837
1838 // Ignore the return value until untied tasks are supported.
1839 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1840 Args);
1841}
1842
1843void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1844 if (!updateToLocation(Loc))
1845 return;
1846 emitTaskwaitImpl(Loc);
1847}
1848
1849void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1850 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1851 uint32_t SrcLocStrSize;
1852 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1853 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1854 Constant *I32Null = ConstantInt::getNullValue(Int32);
1855 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1856
1857 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1858 Args);
1859}
1860
1861void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1862 if (!updateToLocation(Loc))
1863 return;
1864 emitTaskyieldImpl(Loc);
1865}
1866
1867 // Processes the dependencies in Dependencies and does the following
1868 // - Allocates space on the stack for an array of DependInfo objects
1869 // - Populates each DependInfo object with relevant information about
1870 // the corresponding dependence.
1871 // - All code is inserted in the entry block of the current function.
1872 static Value *emitTaskDependencies(
1873 OpenMPIRBuilder &OMPBuilder,
1874 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1875 // Early return if we have no dependencies to process
1876 if (Dependencies.empty())
1877 return nullptr;
1878
1879 // Given a vector of DependData objects, in this function we create an
1880 // array on the stack that holds kmp_depend_info objects corresponding
1881 // to each dependency. This is then passed to the OpenMP runtime.
1882 // For example, if there are 'n' dependencies then the following pseudo
1883 // code is generated. Assume the first dependence is on a variable 'a'.
1884 //
1885 // \code{c}
1886 // DepArray = alloc(n x sizeof(kmp_depend_info));
1887 // idx = 0;
1888 // DepArray[idx].base_addr = ptrtoint(&a);
1889 // DepArray[idx].len = 8;
1890 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1891 // ++idx;
1892 // DepArray[idx].base_addr = ...;
1893 // \endcode
1894
1895 IRBuilderBase &Builder = OMPBuilder.Builder;
1896 Type *DependInfo = OMPBuilder.DependInfo;
1897 Module &M = OMPBuilder.M;
1898
1899 Value *DepArray = nullptr;
1900 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1901 Builder.SetInsertPoint(
1902 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1903
1904 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1905 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1906
1907 Builder.restoreIP(OldIP);
1908
1909 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1910 Value *Base =
1911 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1912 // Store the pointer to the variable
1913 Value *Addr = Builder.CreateStructGEP(
1914 DependInfo, Base,
1915 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1916 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1917 Builder.CreateStore(DepValPtr, Addr);
1918 // Store the size of the variable
1919 Value *Size = Builder.CreateStructGEP(
1920 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1921 Builder.CreateStore(
1922 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1923 Size);
1924 // Store the dependency kind
1925 Value *Flags = Builder.CreateStructGEP(
1926 DependInfo, Base,
1927 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1928 Builder.CreateStore(
1929 ConstantInt::get(Builder.getInt8Ty(),
1930 static_cast<unsigned int>(Dep.DepKind)),
1931 Flags);
1932 }
1933 return DepArray;
1934}
1935
1936OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1937 const LocationDescription &Loc, InsertPointTy AllocaIP,
1938 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1939 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1940 Value *Priority) {
1941
1942 if (!updateToLocation(Loc))
1943 return InsertPointTy();
1944
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 // The current basic block is split into four basic blocks. After outlining,
1949 // they will be mapped as follows:
1950 // ```
1951 // def current_fn() {
1952 // current_basic_block:
1953 // br label %task.exit
1954 // task.exit:
1955 // ; instructions after task
1956 // }
1957 // def outlined_fn() {
1958 // task.alloca:
1959 // br label %task.body
1960 // task.body:
1961 // ret void
1962 // }
1963 // ```
1964 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1965 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1966 BasicBlock *TaskAllocaBB =
1967 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1968
1969 InsertPointTy TaskAllocaIP =
1970 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1971 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1972 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1973 return Err;
1974
1975 OutlineInfo OI;
1976 OI.EntryBB = TaskAllocaBB;
1977 OI.OuterAllocaBB = AllocaIP.getBlock();
1978 OI.ExitBB = TaskExitBB;
1979
1980 // Add the thread ID argument.
1981 SmallVector<Instruction *, 4> ToBeDeleted;
1982 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1983 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1984
1985 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1986 Mergeable, Priority, EventHandle, TaskAllocaBB,
1987 ToBeDeleted](Function &OutlinedFn) mutable {
1988 // Replace the stale CI with the appropriate RTL function call.
1989 assert(OutlinedFn.hasOneUse() &&
1990 "there must be a single user for the outlined function");
1991 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1992
1993 // HasShareds is true if any variables are captured in the outlined region,
1994 // false otherwise.
1995 bool HasShareds = StaleCI->arg_size() > 1;
1996 Builder.SetInsertPoint(StaleCI);
1997
1998 // Gather the arguments for emitting the runtime call for
1999 // @__kmpc_omp_task_alloc
2000 Function *TaskAllocFn =
2001 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2002
2003 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2004 // call.
2005 Value *ThreadID = getOrCreateThreadID(Ident);
2006
2007 // Argument - `flags`
2008 // Task is tied iff (Flags & 1) == 1.
2009 // Task is untied iff (Flags & 1) == 0.
2010 // Task is final iff (Flags & 2) == 2.
2011 // Task is not final iff (Flags & 2) == 0.
2012 // Task is mergeable iff (Flags & 4) == 4.
2013 // Task is not mergeable iff (Flags & 4) == 0.
2014 // Task has priority iff (Flags & 32) == 32.
2015 // Task has no priority iff (Flags & 32) == 0.
2016 // TODO: Handle the other flags.
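// For example, a tied, mergeable task with a priority clause ends up with
// Flags = 1 | 4 | 32 = 37, assembled by the CreateOr calls below.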
2017 Value *Flags = Builder.getInt32(Tied);
2018 if (Final) {
2019 Value *FinalFlag =
2020 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2021 Flags = Builder.CreateOr(FinalFlag, Flags);
2022 }
2023
2024 if (Mergeable)
2025 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2026 if (Priority)
2027 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2028
2029 // Argument - `sizeof_kmp_task_t` (TaskSize)
2030 // TaskSize refers to the size in bytes of the kmp_task_t data structure
2031 // including private vars accessed in the task.
2032 // TODO: add kmp_task_t_with_privates (privates)
2033 Value *TaskSize = Builder.getInt64(
2034 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2035
2036 // Argument - `sizeof_shareds` (SharedsSize)
2037 // SharedsSize refers to the shareds array size in the kmp_task_t data
2038 // structure.
2039 Value *SharedsSize = Builder.getInt64(0);
2040 if (HasShareds) {
2041 AllocaInst *ArgStructAlloca =
2042 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2043 assert(ArgStructAlloca &&
2044 "Unable to find the alloca instruction corresponding to arguments "
2045 "for extracted function");
2046 StructType *ArgStructType =
2047 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2048 assert(ArgStructType && "Unable to find struct type corresponding to "
2049 "arguments for extracted function");
2050 SharedsSize =
2051 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2052 }
2053 // Emit the @__kmpc_omp_task_alloc runtime call
2054 // The runtime call returns a pointer to an area where the task captured
2055 // variables must be copied before the task is run (TaskData)
2056 CallInst *TaskData = Builder.CreateCall(
2057 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2058 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2059 /*task_func=*/&OutlinedFn});
2060
2061 // Emit detach clause initialization.
2062 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2063 // task_descriptor);
2064 if (EventHandle) {
2065 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2066 OMPRTL___kmpc_task_allow_completion_event);
2067 llvm::Value *EventVal =
2068 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2069 llvm::Value *EventHandleAddr =
2070 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2071 Builder.getPtrTy(0));
2072 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2073 Builder.CreateStore(EventVal, EventHandleAddr);
2074 }
2075 // Copy the arguments for outlined function
2076 if (HasShareds) {
2077 Value *Shareds = StaleCI->getArgOperand(1);
2078 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2079 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2080 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2081 SharedsSize);
2082 }
2083
2084 if (Priority) {
2085 //
2086 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2087 // we populate the priority information into the "kmp_task_t" here
2088 //
2089 // The struct "kmp_task_t" definition is available in kmp.h
2090 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2091 // data2 is used for priority
2092 //
2093 Type *Int32Ty = Builder.getInt32Ty();
2094 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2095 // kmp_task_t* => { ptr }
2096 Type *TaskPtr = StructType::get(VoidPtr);
2097 Value *TaskGEP =
2098 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2099 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2100 Type *TaskStructType = StructType::get(
2101 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2102 Value *PriorityData = Builder.CreateInBoundsGEP(
2103 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2104 // kmp_cmplrdata_t => { ptr, ptr }
2105 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2106 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2107 PriorityData, {Zero, Zero});
2108 Builder.CreateStore(Priority, CmplrData);
2109 }
2110
2111 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2112
2113 // In the presence of the `if` clause, the following IR is generated:
2114 // ...
2115 // %data = call @__kmpc_omp_task_alloc(...)
2116 // br i1 %if_condition, label %then, label %else
2117 // then:
2118 // call @__kmpc_omp_task(...)
2119 // br label %exit
2120 // else:
2121 // ;; Wait for resolution of dependencies, if any, before
2122 // ;; beginning the task
2123 // call @__kmpc_omp_wait_deps(...)
2124 // call @__kmpc_omp_task_begin_if0(...)
2125 // call @outlined_fn(...)
2126 // call @__kmpc_omp_task_complete_if0(...)
2127 // br label %exit
2128 // exit:
2129 // ...
2130 if (IfCondition) {
2131 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2132 // terminator.
2133 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2134 Instruction *IfTerminator =
2135 Builder.GetInsertPoint()->getParent()->getTerminator();
2136 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2137 Builder.SetInsertPoint(IfTerminator);
2138 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2139 &ElseTI);
2140 Builder.SetInsertPoint(ElseTI);
2141
2142 if (Dependencies.size()) {
2143 Function *TaskWaitFn =
2144 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2145 Builder.CreateCall(
2146 TaskWaitFn,
2147 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2148 ConstantInt::get(Builder.getInt32Ty(), 0),
2149 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2150 }
2151 Function *TaskBeginFn =
2152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2153 Function *TaskCompleteFn =
2154 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2155 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2156 CallInst *CI = nullptr;
2157 if (HasShareds)
2158 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2159 else
2160 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2161 CI->setDebugLoc(StaleCI->getDebugLoc());
2162 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2163 Builder.SetInsertPoint(ThenTI);
2164 }
2165
2166 if (Dependencies.size()) {
2167 Function *TaskFn =
2168 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2169 Builder.CreateCall(
2170 TaskFn,
2171 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2172 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2173 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2174
2175 } else {
2176 // Emit the @__kmpc_omp_task runtime call to spawn the task
2177 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2178 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2179 }
2180
2181 StaleCI->eraseFromParent();
2182
2183 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2184 if (HasShareds) {
2185 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2186 OutlinedFn.getArg(1)->replaceUsesWithIf(
2187 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2188 }
2189
2190 for (Instruction *I : llvm::reverse(ToBeDeleted))
2191 I->eraseFromParent();
2192 };
2193
2194 addOutlineInfo(std::move(OI));
2195 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2196
2197 return Builder.saveIP();
2198}
2199
2200OpenMPIRBuilder::InsertPointOrErrorTy
2201OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2202 InsertPointTy AllocaIP,
2203 BodyGenCallbackTy BodyGenCB) {
2204 if (!updateToLocation(Loc))
2205 return InsertPointTy();
2206
2207 uint32_t SrcLocStrSize;
2208 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2209 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2210 Value *ThreadID = getOrCreateThreadID(Ident);
2211
2212 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2213 Function *TaskgroupFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2215 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2216
2217 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2218 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2219 return Err;
2220
2221 Builder.SetInsertPoint(TaskgroupExitBB);
2222 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2223 Function *EndTaskgroupFn =
2224 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2225 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2226
2227 return Builder.saveIP();
2228}
2229
2230OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2231 const LocationDescription &Loc, InsertPointTy AllocaIP,
2232 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2233 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2234 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2235
2236 if (!updateToLocation(Loc))
2237 return Loc.IP;
2238
2239 // FiniCBWrapper needs to create a branch to the loop finalization block,
2240 // but that block may not yet exist when this callback runs.
2241 SmallVector<BranchInst *> CancellationBranches;
2242 auto FiniCBWrapper = [&](InsertPointTy IP) {
2243 if (IP.getBlock()->end() != IP.getPoint())
2244 return FiniCB(IP);
2245 // This must be done; otherwise, nested constructs using FinalizeOMPRegion
2246 // will fail, because that function requires the finalization basic block
2247 // to have a terminator, which EmitOMPRegionBody has already removed.
2248 // IP currently points at the cancellation block.
2249 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2250 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2251 CancellationBranches.push_back(DummyBranch);
2252 return FiniCB(IP);
2253 };
2254
2255 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2256
2257 // Each section is emitted as a switch case
2258 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2259 // -> OMP.createSection() which generates the IR for each section
2260 // Iterate through all sections and emit a switch construct:
2261 // switch (IV) {
2262 // case 0:
2263 // <SectionStmt[0]>;
2264 // break;
2265 // ...
2266 // case <NumSection> - 1:
2267 // <SectionStmt[<NumSection> - 1]>;
2268 // break;
2269 // }
2270 // ...
2271 // section_loop.after:
2272 // <FiniCB>;
2273 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2274 Builder.restoreIP(CodeGenIP);
2275 BasicBlock *Continue =
2276 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2277 Function *CurFn = Continue->getParent();
2278 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2279
2280 unsigned CaseNumber = 0;
2281 for (auto SectionCB : SectionCBs) {
2282 BasicBlock *CaseBB = BasicBlock::Create(
2283 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2284 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2285 Builder.SetInsertPoint(CaseBB);
2286 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2287 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2288 CaseEndBr->getIterator()}))
2289 return Err;
2290 CaseNumber++;
2291 }
2292 // Remove the existing terminator from the body BB, since there can be no
2293 // terminators after a switch/case.
2294 return Error::success();
2295 };
2296 // Loop body ends here
2297 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2298 Type *I32Ty = Type::getInt32Ty(M.getContext());
2299 Value *LB = ConstantInt::get(I32Ty, 0);
2300 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2301 Value *ST = ConstantInt::get(I32Ty, 1);
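// E.g., three section callbacks yield a canonical loop over IV = 0, 1, 2,
// where each IV value selects the corresponding switch case above.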
2302 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2303 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2304 if (!LoopInfo)
2305 return LoopInfo.takeError();
2306
2307 InsertPointOrErrorTy WsloopIP =
2308 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2309 WorksharingLoopType::ForStaticLoop, !IsNowait);
2310 if (!WsloopIP)
2311 return WsloopIP.takeError();
2312 InsertPointTy AfterIP = *WsloopIP;
2313
2314 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2315 assert(LoopFini && "Bad structure of static workshare loop finalization");
2316
2317 // Apply the finalization callback in LoopAfterBB
2318 auto FiniInfo = FinalizationStack.pop_back_val();
2319 assert(FiniInfo.DK == OMPD_sections &&
2320 "Unexpected finalization stack state!");
2321 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2322 Builder.restoreIP(AfterIP);
2323 BasicBlock *FiniBB =
2324 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2325 if (Error Err = CB(Builder.saveIP()))
2326 return Err;
2327 AfterIP = {FiniBB, FiniBB->begin()};
2328 }
2329
2330 // Now we can fix the dummy branch to point to the right place
2331 for (BranchInst *DummyBranch : CancellationBranches) {
2332 assert(DummyBranch->getNumSuccessors() == 1);
2333 DummyBranch->setSuccessor(0, LoopFini);
2334 }
2335
2336 return AfterIP;
2337}
2338
2339OpenMPIRBuilder::InsertPointOrErrorTy
2340OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2341 BodyGenCallbackTy BodyGenCB,
2342 FinalizeCallbackTy FiniCB) {
2343 if (!updateToLocation(Loc))
2344 return Loc.IP;
2345
2346 auto FiniCBWrapper = [&](InsertPointTy IP) {
2347 if (IP.getBlock()->end() != IP.getPoint())
2348 return FiniCB(IP);
2349 // This must be done; otherwise, nested constructs using FinalizeOMPRegion
2350 // will fail, because that function requires the finalization basic block
2351 // to have a terminator, which EmitOMPRegionBody has already removed.
2352 // IP currently points at the cancellation block.
2353 // We need to backtrack to the condition block to fetch
2354 // the exit block and create a branch from the cancellation
2355 // block to the exit block.
2356 IRBuilder<>::InsertPointGuard IPG(Builder);
2357 Builder.restoreIP(IP);
2358 auto *CaseBB = Loc.IP.getBlock();
2359 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2360 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2361 Instruction *I = Builder.CreateBr(ExitBB);
2362 IP = InsertPointTy(I->getParent(), I->getIterator());
2363 return FiniCB(IP);
2364 };
2365
2366 Directive OMPD = Directive::OMPD_sections;
2367 // Since we are using Finalization Callback here, HasFinalize
2368 // and IsCancellable have to be true
2369 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2370 /*Conditional*/ false, /*hasFinalize*/ true,
2371 /*IsCancellable*/ true);
2372}
2373
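// Return an insertion point positioned immediately after the instruction I.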
2374static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2375 BasicBlock::iterator IT(I);
2376 IT++;
2377 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2378}
2379
2380Value *OpenMPIRBuilder::getGPUThreadID() {
2381 return Builder.CreateCall(
2382 getOrCreateRuntimeFunction(M,
2383 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2384 {});
2385}
2386
2387Value *OpenMPIRBuilder::getGPUWarpSize() {
2388 return Builder.CreateCall(
2389 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2390}
2391
2392Value *OpenMPIRBuilder::getNVPTXWarpID() {
2393 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2394 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2395}
2396
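// lane_id = thread_id & (warp_size - 1). E.g., with the common warp size of
// 32, LaneIDBits is 5 and LaneIDMask is 0x1f.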
2397Value *OpenMPIRBuilder::getNVPTXLaneID() {
2398 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2399 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2400 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2401 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2402 "nvptx_lane_id");
2403}
2404
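// Cast From to ToType: same-size values are bitcast, integer-to-integer
// casts go through CreateIntCast, and any remaining mismatch is routed
// through a temporary alloca (stored as the source type, read back as
// ToType).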
2405Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2406 Type *ToType) {
2407 Type *FromType = From->getType();
2408 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2409 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2410 assert(FromSize > 0 && "From size must be greater than zero");
2411 assert(ToSize > 0 && "To size must be greater than zero");
2412 if (FromType == ToType)
2413 return From;
2414 if (FromSize == ToSize)
2415 return Builder.CreateBitCast(From, ToType);
2416 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2417 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2418 InsertPointTy SaveIP = Builder.saveIP();
2419 Builder.restoreIP(AllocaIP);
2420 Value *CastItem = Builder.CreateAlloca(ToType);
2421 Builder.restoreIP(SaveIP);
2422
2423 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2424 CastItem, Builder.getPtrTy(0));
2425 Builder.CreateStore(From, ValCastItem);
2426 return Builder.CreateLoad(ToType, CastItem);
2427}
2428
2429Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2430 Value *Element,
2431 Type *ElementType,
2432 Value *Offset) {
2433 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2434 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2435
2436 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2437 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2438 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2439 Value *WarpSize =
2440 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2441 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2442 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2443 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2444 Value *WarpSizeCast =
2445 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2446 Value *ShuffleCall =
2447 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2448 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2449}
2450
2451void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2452 Value *DstAddr, Type *ElemType,
2453 Value *Offset, Type *ReductionArrayTy) {
2454 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2455 // Create the loop over the big sized data.
2456 // ptr = (void*)Elem;
2457 // ptrEnd = (void*) Elem + 1;
2458 // Step = 8;
2459 // while (ptr + Step < ptrEnd)
2460 // shuffle((int64_t)*ptr);
2461 // Step = 4;
2462 // while (ptr + Step < ptrEnd)
2463 // shuffle((int32_t)*ptr);
2464 // ...
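// For example, a 12-byte element is moved as one 8-byte shuffle followed by
// one 4-byte shuffle (Size % 8 == 4 after the first iteration).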
2465 Type *IndexTy = Builder.getIndexTy(
2466 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2467 Value *ElemPtr = DstAddr;
2468 Value *Ptr = SrcAddr;
2469 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2470 if (Size < IntSize)
2471 continue;
2472 Type *IntType = Builder.getIntNTy(IntSize * 8);
2473 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2474 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2475 Value *SrcAddrGEP =
2476 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2477 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2478 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2479
2480 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2481 if ((Size / IntSize) > 1) {
2482 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2483 SrcAddrGEP, Builder.getPtrTy());
2484 BasicBlock *PreCondBB =
2485 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2486 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2487 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2488 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2489 emitBlock(PreCondBB, CurFunc);
2490 PHINode *PhiSrc =
2491 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2492 PhiSrc->addIncoming(Ptr, CurrentBB);
2493 PHINode *PhiDest =
2494 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2495 PhiDest->addIncoming(ElemPtr, CurrentBB);
2496 Ptr = PhiSrc;
2497 ElemPtr = PhiDest;
2498 Value *PtrDiff = Builder.CreatePtrDiff(
2499 Builder.getInt8Ty(), PtrEnd,
2500 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2501 Builder.CreateCondBr(
2502 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2503 ExitBB);
2504 emitBlock(ThenBB, CurFunc);
2505 Value *Res = createRuntimeShuffleFunction(
2506 AllocaIP,
2507 Builder.CreateAlignedLoad(
2508 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2509 IntType, Offset);
2510 Builder.CreateAlignedStore(Res, ElemPtr,
2511 M.getDataLayout().getPrefTypeAlign(ElemType));
2512 Value *LocalPtr =
2513 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2514 Value *LocalElemPtr =
2515 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2516 PhiSrc->addIncoming(LocalPtr, ThenBB);
2517 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2518 emitBranch(PreCondBB);
2519 emitBlock(ExitBB, CurFunc);
2520 } else {
2521 Value *Res = createRuntimeShuffleFunction(
2522 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2523 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2524 Res->getType()->getScalarSizeInBits())
2525 Res = Builder.CreateTrunc(Res, ElemType);
2526 Builder.CreateStore(Res, ElemPtr);
2527 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2528 ElemPtr =
2529 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2530 }
2531 Size = Size % IntSize;
2532 }
2533}
2534
2535void OpenMPIRBuilder::emitReductionListCopy(
2536 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2537 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2538 CopyOptionsTy CopyOptions) {
2539 Type *IndexTy = Builder.getIndexTy(
2540 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2541 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2542
2543 // Iterates, element by element, through the source Reduce list and
2544 // makes a copy.
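// RemoteLaneToThread allocates a fresh private element and shuffles the
// remote lane's value into it; ThreadCopy reuses the existing destination
// element and copies the source value directly.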
2545 for (auto En : enumerate(ReductionInfos)) {
2546 const ReductionInfo &RI = En.value();
2547 Value *SrcElementAddr = nullptr;
2548 Value *DestElementAddr = nullptr;
2549 Value *DestElementPtrAddr = nullptr;
2550 // Should we shuffle in an element from a remote lane?
2551 bool ShuffleInElement = false;
2552 // Set to true to update the pointer in the dest Reduce list to a
2553 // newly created element.
2554 bool UpdateDestListPtr = false;
2555
2556 // Step 1.1: Get the address for the src element in the Reduce list.
2557 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2558 ReductionArrayTy, SrcBase,
2559 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2560 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2561
2562 // Step 1.2: Create a temporary to store the element in the destination
2563 // Reduce list.
2564 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2565 ReductionArrayTy, DestBase,
2566 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2567 switch (Action) {
2568 case CopyAction::RemoteLaneToThread: {
2569 InsertPointTy CurIP = Builder.saveIP();
2570 Builder.restoreIP(AllocaIP);
2571 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2572 ".omp.reduction.element");
2573 DestAlloca->setAlignment(
2574 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2575 DestElementAddr = DestAlloca;
2576 DestElementAddr =
2577 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2578 DestElementAddr->getName() + ".ascast");
2579 Builder.restoreIP(CurIP);
2580 ShuffleInElement = true;
2581 UpdateDestListPtr = true;
2582 break;
2583 }
2584 case CopyAction::ThreadCopy: {
2585 DestElementAddr =
2586 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2587 break;
2588 }
2589 }
2590
2591 // Now that all active lanes have read the element in the
2592 // Reduce list, shuffle over the value from the remote lane.
2593 if (ShuffleInElement) {
2594 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2595 RemoteLaneOffset, ReductionArrayTy);
2596 } else {
2597 switch (RI.EvaluationKind) {
2598 case EvalKind::Scalar: {
2599 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2600 // Store the source element value to the dest element address.
2601 Builder.CreateStore(Elem, DestElementAddr);
2602 break;
2603 }
2604 case EvalKind::Complex: {
2605 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2606 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2607 Value *SrcReal = Builder.CreateLoad(
2608 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2609 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2610 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2611 Value *SrcImg = Builder.CreateLoad(
2612 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2613
2614 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2615 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2616 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2617 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2618 Builder.CreateStore(SrcReal, DestRealPtr);
2619 Builder.CreateStore(SrcImg, DestImgPtr);
2620 break;
2621 }
2622 case EvalKind::Aggregate: {
2623 Value *SizeVal = Builder.getInt64(
2624 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2625 Builder.CreateMemCpy(
2626 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2627 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2628 SizeVal, false);
2629 break;
2630 }
2631 };
2632 }
2633
2634 // Step 3.1: Modify reference in dest Reduce list as needed.
2635 // Modifying the reference in Reduce list to point to the newly
2636 // created element. The element is live in the current function
2637 // scope and that of functions it invokes (i.e., reduce_function).
2638 // RemoteReduceData[i] = (void*)&RemoteElem
2639 if (UpdateDestListPtr) {
2640 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2641 DestElementAddr, Builder.getPtrTy(),
2642 DestElementAddr->getName() + ".ascast");
2643 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2644 }
2645 }
2646}
2647
2648Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2649 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2650 AttributeList FuncAttrs) {
2651 InsertPointTy SavedIP = Builder.saveIP();
2652 LLVMContext &Ctx = M.getContext();
2653 FunctionType *FuncTy = FunctionType::get(
2654 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2655 /* IsVarArg */ false);
2656 Function *WcFunc =
2658 "_omp_reduction_inter_warp_copy_func", &M);
2659 WcFunc->setAttributes(FuncAttrs);
2660 WcFunc->addParamAttr(0, Attribute::NoUndef);
2661 WcFunc->addParamAttr(1, Attribute::NoUndef);
2662 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2663 Builder.SetInsertPoint(EntryBB);
2664
2665 // ReduceList: thread local Reduce list.
2666 // At the stage of the computation when this function is called, partially
2667 // aggregated values reside in the first lane of every active warp.
2668 Argument *ReduceListArg = WcFunc->getArg(0);
2669 // NumWarps: number of warps active in the parallel region. This could
2670 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2671 Argument *NumWarpsArg = WcFunc->getArg(1);
2672
2673 // This array is used as a medium to transfer, one reduce element at a time,
2674 // the data from the first lane of every warp to lanes in the first warp
2675 // in order to perform the final step of a reduction in a parallel region
2676 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2677 // for reduced latency, as well as to have a distinct copy for concurrently
2678 // executing target regions. The array is declared with weak linkage so
2679 // that a single copy is shared across compilation units.
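// The copy below proceeds in two barrier-separated phases: lane 0 of every
// warp stores its partial value to medium[warp_id], then the first NumWarps
// threads of warp 0 load medium[tid] back into their own reduce list.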
2680 StringRef TransferMediumName =
2681 "__openmp_nvptx_data_transfer_temporary_storage";
2682 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2683 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2684 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2685 if (!TransferMedium) {
2686 TransferMedium = new GlobalVariable(
2687 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2688 UndefValue::get(ArrayTy), TransferMediumName,
2689 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2690 /*AddressSpace=*/3);
2691 }
2692
2693 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694 Value *GPUThreadID = getGPUThreadID();
2695 // nvptx_lane_id = nvptx_id % warpsize
2696 Value *LaneID = getNVPTXLaneID();
2697 // nvptx_warp_id = nvptx_id / warpsize
2698 Value *WarpID = getNVPTXWarpID();
2699
2700 InsertPointTy AllocaIP =
2701 InsertPointTy(Builder.GetInsertBlock(),
2702 Builder.GetInsertBlock()->getFirstInsertionPt());
2703 Type *Arg0Type = ReduceListArg->getType();
2704 Type *Arg1Type = NumWarpsArg->getType();
2705 Builder.restoreIP(AllocaIP);
2706 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2707 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2708 AllocaInst *NumWarpsAlloca =
2709 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2710 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2711 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2712 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2713 NumWarpsAlloca, Builder.getPtrTy(0),
2714 NumWarpsAlloca->getName() + ".ascast");
2715 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2716 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2717 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2718 InsertPointTy CodeGenIP =
2719 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2720 Builder.restoreIP(CodeGenIP);
2721
2722 Value *ReduceList =
2723 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2724
2725 for (auto En : enumerate(ReductionInfos)) {
2726 //
2727 // Warp master copies reduce element to transfer medium in __shared__
2728 // memory.
2729 //
2730 const ReductionInfo &RI = En.value();
2731 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2732 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2733 Type *CType = Builder.getIntNTy(TySize * 8);
2734
2735 unsigned NumIters = RealTySize / TySize;
2736 if (NumIters == 0)
2737 continue;
2738 Value *Cnt = nullptr;
2739 Value *CntAddr = nullptr;
2740 BasicBlock *PrecondBB = nullptr;
2741 BasicBlock *ExitBB = nullptr;
2742 if (NumIters > 1) {
2743 CodeGenIP = Builder.saveIP();
2744 Builder.restoreIP(AllocaIP);
2745 CntAddr =
2746 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2747
2748 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2749 CntAddr->getName() + ".ascast");
2750 Builder.restoreIP(CodeGenIP);
2751 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2752 CntAddr,
2753 /*Volatile=*/false);
2754 PrecondBB = BasicBlock::Create(Ctx, "precond");
2755 ExitBB = BasicBlock::Create(Ctx, "exit");
2756 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2757 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2758 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2759 /*Volatile=*/false);
2760 Value *Cmp = Builder.CreateICmpULT(
2761 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2762 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2763 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2764 }
2765
2766 // kmpc_barrier.
2767 InsertPointOrErrorTy BarrierIP1 =
2768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2769 omp::Directive::OMPD_unknown,
2770 /* ForceSimpleCall */ false,
2771 /* CheckCancelFlag */ true);
2772 if (!BarrierIP1)
2773 return BarrierIP1.takeError();
2774 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2775 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2776 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2777
2778 // if (lane_id == 0)
2779 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2780 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2781 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2782
2783 // Reduce element = LocalReduceList[i]
2784 auto *RedListArrayTy =
2785 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2786 Type *IndexTy = Builder.getIndexTy(
2787 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2788 Value *ElemPtrPtr =
2789 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2790 {ConstantInt::get(IndexTy, 0),
2791 ConstantInt::get(IndexTy, En.index())});
2792 // elemptr = ((CopyType*)(elemptrptr)) + I
2793 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2794 if (NumIters > 1)
2795 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2796
2797 // Get pointer to location in transfer medium.
2798 // MediumPtr = &medium[warp_id]
2799 Value *MediumPtr = Builder.CreateInBoundsGEP(
2800 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2801 // elem = *elemptr
2802 // *MediumPtr = elem
2803 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2804 // Store the source element value to the dest element address.
2805 Builder.CreateStore(Elem, MediumPtr,
2806 /*IsVolatile*/ true);
2807 Builder.CreateBr(MergeBB);
2808
2809 // else
2810 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2811 Builder.CreateBr(MergeBB);
2812
2813 // endif
2814 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2815 InsertPointOrErrorTy BarrierIP2 =
2816 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2817 omp::Directive::OMPD_unknown,
2818 /* ForceSimpleCall */ false,
2819 /* CheckCancelFlag */ true);
2820 if (!BarrierIP2)
2821 return BarrierIP2.takeError();
2822
2823 // Warp 0 copies reduce element from transfer medium
2824 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2825 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2826 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2827
2828 Value *NumWarpsVal =
2829 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2830 // Up to 32 threads in warp 0 are active.
2831 Value *IsActiveThread =
2832 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2833 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2834
2835 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2836
2837 // SrcMediumPtr = &medium[tid]
2838 // SrcMediumVal = *SrcMediumPtr
2839 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2840 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2841 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2842 Value *TargetElemPtrPtr =
2843 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2844 {ConstantInt::get(IndexTy, 0),
2845 ConstantInt::get(IndexTy, En.index())});
2846 Value *TargetElemPtrVal =
2847 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2848 Value *TargetElemPtr = TargetElemPtrVal;
2849 if (NumIters > 1)
2850 TargetElemPtr =
2851 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2852
2853 // *TargetElemPtr = SrcMediumVal;
2854 Value *SrcMediumValue =
2855 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2856 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2857 Builder.CreateBr(W0MergeBB);
2858
2859 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2860 Builder.CreateBr(W0MergeBB);
2861
2862 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2863
2864 if (NumIters > 1) {
2865 Cnt = Builder.CreateNSWAdd(
2866 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2867 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2868
2869 auto *CurFn = Builder.GetInsertBlock()->getParent();
2870 emitBranch(PrecondBB);
2871 emitBlock(ExitBB, CurFn);
2872 }
2873 RealTySize %= TySize;
2874 }
2875 }
2876
2877 Builder.CreateRetVoid();
2878 Builder.restoreIP(SavedIP);
2879
2880 return WcFunc;
2881}
2882
2883Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2885 AttributeList FuncAttrs) {
2886 LLVMContext &Ctx = M.getContext();
2887 FunctionType *FuncTy =
2888 FunctionType::get(Builder.getVoidTy(),
2889 {Builder.getPtrTy(), Builder.getInt16Ty(),
2890 Builder.getInt16Ty(), Builder.getInt16Ty()},
2891 /* IsVarArg */ false);
2892 Function *SarFunc =
2894 "_omp_reduction_shuffle_and_reduce_func", &M);
2895 SarFunc->setAttributes(FuncAttrs);
2896 SarFunc->addParamAttr(0, Attribute::NoUndef);
2897 SarFunc->addParamAttr(1, Attribute::NoUndef);
2898 SarFunc->addParamAttr(2, Attribute::NoUndef);
2899 SarFunc->addParamAttr(3, Attribute::NoUndef);
2900 SarFunc->addParamAttr(1, Attribute::SExt);
2901 SarFunc->addParamAttr(2, Attribute::SExt);
2902 SarFunc->addParamAttr(3, Attribute::SExt);
2903 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2904 Builder.SetInsertPoint(EntryBB);
2905
2906 // Thread local Reduce list used to host the values of data to be reduced.
2907 Argument *ReduceListArg = SarFunc->getArg(0);
2908 // Current lane id; could be logical.
2909 Argument *LaneIDArg = SarFunc->getArg(1);
2910 // Offset of the remote source lane relative to the current lane.
2911 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2912 // Algorithm version. This is expected to be known at compile time.
2913 Argument *AlgoVerArg = SarFunc->getArg(3);
2914
2915 Type *ReduceListArgType = ReduceListArg->getType();
2916 Type *LaneIDArgType = LaneIDArg->getType();
2917 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2918 Value *ReduceListAlloca = Builder.CreateAlloca(
2919 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2920 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2921 LaneIDArg->getName() + ".addr");
2922 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2923 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2924 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2925 AlgoVerArg->getName() + ".addr");
2926 ArrayType *RedListArrayTy =
2927 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2928
2929 // Create a local thread-private variable to host the Reduce list
2930 // from a remote lane.
2931 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2932 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2933
2934 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2935 ReduceListAlloca, ReduceListArgType,
2936 ReduceListAlloca->getName() + ".ascast");
2937 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2938 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2939 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2940 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2941 RemoteLaneOffsetAlloca->getName() + ".ascast");
2942 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2943 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2944 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2945 RemoteReductionListAlloca, Builder.getPtrTy(),
2946 RemoteReductionListAlloca->getName() + ".ascast");
2947
2948 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2949 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2950 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2951 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2952
2953 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2954 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2955 Value *RemoteLaneOffset =
2956 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2957 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2958
2959 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2960
2961 // This loop iterates through the list of reduce elements and copies,
2962 // element by element, from a remote lane in the warp to RemoteReduceList,
2963 // hosted on the thread's stack.
2964 emitReductionListCopy(
2965 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2966 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2967
2968 // The actions to be performed on the Remote Reduce list are dependent
2969 // on the algorithm version.
2970 //
2971 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2972 // LaneId % 2 == 0 && Offset > 0):
2973 // do the reduction value aggregation
2974 //
2975 // The thread local variable Reduce list is mutated in place to host the
2976 // reduced data, which is the aggregated value produced from local and
2977 // remote lanes.
2978 //
2979 // Note that AlgoVer is expected to be a constant integer known at compile
2980 // time.
2981 // When AlgoVer==0, the first conjunction evaluates to true, making
2982 // the entire predicate true at compile time.
2983 // When AlgoVer==1, only the second half of the second conjunction needs
2984 // to be evaluated at runtime; the other conjunctions evaluate to false
2985 // at compile time.
2986 // When AlgoVer==2, only the second half of the third conjunction needs
2987 // to be evaluated at runtime; the other conjunctions evaluate to false
2988 // at compile time.
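// Written as a single C expression, the predicate computed below is
// (note the unsigned compare for LaneId and the signed one for the offset):
//   (AlgoVer == 0) || (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
//   (AlgoVer == 2 && (LaneId & 1) == 0 && RemoteLaneOffset > 0)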
2989 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2990 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2991 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2992 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2993 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2994 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2995 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2996 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2997 Value *RemoteOffsetComp =
2998 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2999 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3000 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3001 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3002
3003 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3004 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3005 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3006
3007 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3008 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3009 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3010 ReduceList, Builder.getPtrTy());
3011 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3012 RemoteListAddrCast, Builder.getPtrTy());
3013 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3014 ->addFnAttr(Attribute::NoUnwind);
3015 Builder.CreateBr(MergeBB);
3016
3017 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3018 Builder.CreateBr(MergeBB);
3019
3020 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3021
3022 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3023 // Reduce list.
3024 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3025 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3026 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3027
3028 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3029 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3030 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3031 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3032
3033 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3034 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3035 ReductionInfos, RemoteListAddrCast, ReduceList);
3036 Builder.CreateBr(CpyMergeBB);
3037
3038 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3039 Builder.CreateBr(CpyMergeBB);
3040
3041 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3042
3043 Builder.CreateRetVoid();
3044
3045 return SarFunc;
3046}
3047
3048Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3049 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3050 AttributeList FuncAttrs) {
3051 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3052 LLVMContext &Ctx = M.getContext();
3053 auto *FuncTy = FunctionType::get(
3054 Builder.getVoidTy(),
3055 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3056 /* IsVarArg */ false);
3057 Function *LtGCFunc =
3059 "_omp_reduction_list_to_global_copy_func", &M);
3060 LtGCFunc->setAttributes(FuncAttrs);
3061 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3062 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3063 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3064
3065 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3066 Builder.SetInsertPoint(EntryBlock);
3067
3068 // Buffer: global reduction buffer.
3069 Argument *BufferArg = LtGCFunc->getArg(0);
3070 // Idx: index of the buffer.
3071 Argument *IdxArg = LtGCFunc->getArg(1);
3072 // ReduceList: thread local Reduce list.
3073 Argument *ReduceListArg = LtGCFunc->getArg(2);
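// In pseudo-C, the body emitted below performs roughly the following
// (a sketch; field_i stands for the i-th member of the
// struct._globalized_locals_ty buffer element):
//   for (i = 0; i < <n>; ++i)
//     buffer[idx].field_i = *(<type_i> *)reduce_list[i];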
3074
3075 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3076 BufferArg->getName() + ".addr");
3077 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3078 IdxArg->getName() + ".addr");
3079 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3080 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3081 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3082 BufferArgAlloca, Builder.getPtrTy(),
3083 BufferArgAlloca->getName() + ".ascast");
3084 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3085 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3086 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3087 ReduceListArgAlloca, Builder.getPtrTy(),
3088 ReduceListArgAlloca->getName() + ".ascast");
3089
3090 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3091 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3092 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3093
3094 Value *LocalReduceList =
3095 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3096 Value *BufferArgVal =
3097 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3098 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3099 Type *IndexTy = Builder.getIndexTy(
3100 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3101 for (auto En : enumerate(ReductionInfos)) {
3102 const ReductionInfo &RI = En.value();
3103 auto *RedListArrayTy =
3104 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3105 // Reduce element = LocalReduceList[i]
3106 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3107 RedListArrayTy, LocalReduceList,
3108 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3109 // elemptr = ((CopyType*)(elemptrptr)) + I
3110 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3111
3112 // Global = Buffer.VD[Idx];
3113 Value *BufferVD =
3114 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3115 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3116 ReductionsBufferTy, BufferVD, 0, En.index());
3117
3118 switch (RI.EvaluationKind) {
3119 case EvalKind::Scalar: {
3120 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3121 Builder.CreateStore(TargetElement, GlobVal);
3122 break;
3123 }
3124 case EvalKind::Complex: {
3125 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3126 RI.ElementType, ElemPtr, 0, 0, ".realp");
3127 Value *SrcReal = Builder.CreateLoad(
3128 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3129 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3130 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3131 Value *SrcImg = Builder.CreateLoad(
3132 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3133
3134 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3135 RI.ElementType, GlobVal, 0, 0, ".realp");
3136 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3137 RI.ElementType, GlobVal, 0, 1, ".imagp");
3138 Builder.CreateStore(SrcReal, DestRealPtr);
3139 Builder.CreateStore(SrcImg, DestImgPtr);
3140 break;
3141 }
3142 case EvalKind::Aggregate: {
3143 Value *SizeVal =
3144 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3145 Builder.CreateMemCpy(
3146 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3147 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3148 break;
3149 }
3150 }
3151 }
3152
3153 Builder.CreateRetVoid();
3154 Builder.restoreIP(OldIP);
3155 return LtGCFunc;
3156}
3157
3158Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3159 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3160 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3161 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3162 LLVMContext &Ctx = M.getContext();
3163 auto *FuncTy = FunctionType::get(
3164 Builder.getVoidTy(),
3165 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3166 /* IsVarArg */ false);
3167 Function *LtGRFunc =
3169 "_omp_reduction_list_to_global_reduce_func", &M);
3170 LtGRFunc->setAttributes(FuncAttrs);
3171 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3172 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3173 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3174
3175 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3176 Builder.SetInsertPoint(EntryBlock);
3177
3178 // Buffer: global reduction buffer.
3179 Argument *BufferArg = LtGRFunc->getArg(0);
3180 // Idx: index of the buffer.
3181 Argument *IdxArg = LtGRFunc->getArg(1);
3182 // ReduceList: thread local Reduce list.
3183 Argument *ReduceListArg = LtGRFunc->getArg(2);
3184
3185 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3186 BufferArg->getName() + ".addr");
3187 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3188 IdxArg->getName() + ".addr");
3189 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3190 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3191 auto *RedListArrayTy =
3192 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3193
3194 // 1. Build a list of reduction variables.
3195 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3196 Value *LocalReduceList =
3197 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3198
3199 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3200 BufferArgAlloca, Builder.getPtrTy(),
3201 BufferArgAlloca->getName() + ".ascast");
3202 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3203 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3204 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 ReduceListArgAlloca, Builder.getPtrTy(),
3206 ReduceListArgAlloca->getName() + ".ascast");
3207 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3208 LocalReduceList, Builder.getPtrTy(),
3209 LocalReduceList->getName() + ".ascast");
3210
3211 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3212 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3213 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3214
3215 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3216 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3217 Type *IndexTy = Builder.getIndexTy(
3218 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3219 for (auto En : enumerate(ReductionInfos)) {
3220 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3221 RedListArrayTy, LocalReduceListAddrCast,
3222 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3223 Value *BufferVD =
3224 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3225 // Global = Buffer.VD[Idx];
3226 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3227 ReductionsBufferTy, BufferVD, 0, En.index());
3228 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3229 }
3230
3231 // Call reduce_function(GlobalReduceList, ReduceList)
3232 Value *ReduceList =
3233 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3234 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3235 ->addFnAttr(Attribute::NoUnwind);
3236 Builder.CreateRetVoid();
3237 Builder.restoreIP(OldIP);
3238 return LtGRFunc;
3239}
3240
3241Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3242 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3243 AttributeList FuncAttrs) {
3244 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3245 LLVMContext &Ctx = M.getContext();
3246 auto *FuncTy = FunctionType::get(
3247 Builder.getVoidTy(),
3248 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3249 /* IsVarArg */ false);
3250 Function *LtGCFunc =
3252 "_omp_reduction_global_to_list_copy_func", &M);
3253 LtGCFunc->setAttributes(FuncAttrs);
3254 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3255 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3256 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3257
3258 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3259 Builder.SetInsertPoint(EntryBlock);
3260
3261 // Buffer: global reduction buffer.
3262 Argument *BufferArg = LtGCFunc->getArg(0);
3263 // Idx: index of the buffer.
3264 Argument *IdxArg = LtGCFunc->getArg(1);
3265 // ReduceList: thread local Reduce list.
3266 Argument *ReduceListArg = LtGCFunc->getArg(2);
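// This is the inverse of the list-to-global copy above; in pseudo-C the
// body emitted below is roughly (a sketch):
//   for (i = 0; i < <n>; ++i)
//     *(<type_i> *)reduce_list[i] = buffer[idx].field_i;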
3267
3268 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3269 BufferArg->getName() + ".addr");
3270 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3271 IdxArg->getName() + ".addr");
3272 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3273 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3274 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3275 BufferArgAlloca, Builder.getPtrTy(),
3276 BufferArgAlloca->getName() + ".ascast");
3277 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3278 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3279 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3280 ReduceListArgAlloca, Builder.getPtrTy(),
3281 ReduceListArgAlloca->getName() + ".ascast");
3282 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3283 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3284 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3285
3286 Value *LocalReduceList =
3287 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3288 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3289 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3290 Type *IndexTy = Builder.getIndexTy(
3291 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3292 for (auto En : enumerate(ReductionInfos)) {
3293 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3294 auto *RedListArrayTy =
3295 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3296 // Reduce element = LocalReduceList[i]
3297 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3298 RedListArrayTy, LocalReduceList,
3299 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3300 // elemptr = ((CopyType*)(elemptrptr)) + I
3301 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3302 // Global = Buffer.VD[Idx];
3303 Value *BufferVD =
3304 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3305 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3306 ReductionsBufferTy, BufferVD, 0, En.index());
3307
3308 switch (RI.EvaluationKind) {
3309 case EvalKind::Scalar: {
3310 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3311 Builder.CreateStore(TargetElement, ElemPtr);
3312 break;
3313 }
3314 case EvalKind::Complex: {
3315 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3316 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3317 Value *SrcReal = Builder.CreateLoad(
3318 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3319 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3320 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3321 Value *SrcImg = Builder.CreateLoad(
3322 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3323
3324 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3325 RI.ElementType, ElemPtr, 0, 0, ".realp");
3326 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3327 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3328 Builder.CreateStore(SrcReal, DestRealPtr);
3329 Builder.CreateStore(SrcImg, DestImgPtr);
3330 break;
3331 }
3332 case EvalKind::Aggregate: {
3333 Value *SizeVal =
3334 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3335 Builder.CreateMemCpy(
3336 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3337 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3338 SizeVal, false);
3339 break;
3340 }
3341 }
3342 }
3343
3344 Builder.CreateRetVoid();
3345 Builder.restoreIP(OldIP);
3346 return LtGCFunc;
3347}
3348
3349Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3350 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3351 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3352 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3353 LLVMContext &Ctx = M.getContext();
3354 auto *FuncTy = FunctionType::get(
3355 Builder.getVoidTy(),
3356 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3357 /* IsVarArg */ false);
3358 Function *LtGRFunc =
3360 "_omp_reduction_global_to_list_reduce_func", &M);
3361 LtGRFunc->setAttributes(FuncAttrs);
3362 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3363 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3364 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3365
3366 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3367 Builder.SetInsertPoint(EntryBlock);
3368
3369 // Buffer: global reduction buffer.
3370 Argument *BufferArg = LtGRFunc->getArg(0);
3371 // Idx: index of the buffer.
3372 Argument *IdxArg = LtGRFunc->getArg(1);
3373 // ReduceList: thread local Reduce list.
3374 Argument *ReduceListArg = LtGRFunc->getArg(2);
3375
3376 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3377 BufferArg->getName() + ".addr");
3378 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3379 IdxArg->getName() + ".addr");
3380 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3381 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3382 ArrayType *RedListArrayTy =
3383 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3384
3385 // 1. Build a list of reduction variables.
3386 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3387 Value *LocalReduceList =
3388 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3389
3390 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3391 BufferArgAlloca, Builder.getPtrTy(),
3392 BufferArgAlloca->getName() + ".ascast");
3393 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3394 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3395 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3396 ReduceListArgAlloca, Builder.getPtrTy(),
3397 ReduceListArgAlloca->getName() + ".ascast");
3398 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3399 LocalReduceList, Builder.getPtrTy(),
3400 LocalReduceList->getName() + ".ascast");
3401
3402 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3403 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3404 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3405
3406 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3407 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3408 Type *IndexTy = Builder.getIndexTy(
3409 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3410 for (auto En : enumerate(ReductionInfos)) {
3411 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3412 RedListArrayTy, ReductionList,
3413 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3414 // Global = Buffer.VD[Idx];
3415 Value *BufferVD =
3416 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3417 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3418 ReductionsBufferTy, BufferVD, 0, En.index());
3419 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3420 }
3421
3422 // Call reduce_function(ReduceList, GlobalReduceList)
3423 Value *ReduceList =
3424 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3425 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3426 ->addFnAttr(Attribute::NoUnwind);
3427 Builder.CreateRetVoid();
3428 Builder.restoreIP(OldIP);
3429 return LtGRFunc;
3430}
3431
3432std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3433 std::string Suffix =
3434 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3435 return (Name + Suffix).str();
3436}
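// For example, a reducer named "foo" typically yields a name such as
// "foo.omp.reduction.reduction_func" on the host; the exact separators are
// platform specific and come from createPlatformSpecificName.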
3437
3438Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3439 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3440 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3441 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3442 {Builder.getPtrTy(), Builder.getPtrTy()},
3443 /* IsVarArg */ false);
3444 std::string Name = getReductionFuncName(ReducerName);
3445 Function *ReductionFunc =
3446 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3447 ReductionFunc->setAttributes(FuncAttrs);
3448 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3449 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3450 BasicBlock *EntryBB =
3451 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3452 Builder.SetInsertPoint(EntryBB);
3453
3454 // We need to allocate memory here and handle the address-space casts
3455 // before extracting the LHS/RHS pointers.
3456 Value *LHSArrayPtr = nullptr;
3457 Value *RHSArrayPtr = nullptr;
3458 Argument *Arg0 = ReductionFunc->getArg(0);
3459 Argument *Arg1 = ReductionFunc->getArg(1);
3460 Type *Arg0Type = Arg0->getType();
3461 Type *Arg1Type = Arg1->getType();
3462
3463 Value *LHSAlloca =
3464 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3465 Value *RHSAlloca =
3466 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3467 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3468 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3469 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3470 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3471 Builder.CreateStore(Arg0, LHSAddrCast);
3472 Builder.CreateStore(Arg1, RHSAddrCast);
3473 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3474 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3475
3476 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3477 Type *IndexTy = Builder.getIndexTy(
3478 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3479 SmallVector<Value *> LHSPtrs, RHSPtrs;
3480 for (auto En : enumerate(ReductionInfos)) {
3481 const ReductionInfo &RI = En.value();
3482 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3483 RedArrayTy, RHSArrayPtr,
3484 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3485 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3486 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RHSI8Ptr, RI.PrivateVariable->getType(),
3488 RHSI8Ptr->getName() + ".ascast");
3489
3490 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3491 RedArrayTy, LHSArrayPtr,
3492 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3493 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3494 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3495 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3496
3497 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3498 LHSPtrs.emplace_back(LHSPtr);
3499 RHSPtrs.emplace_back(RHSPtr);
3500 } else {
3501 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3502 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3503 Value *Reduced;
3504 InsertPointOrErrorTy AfterIP =
3505 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3506 if (!AfterIP)
3507 return AfterIP.takeError();
3508 if (!Builder.GetInsertBlock())
3509 return ReductionFunc;
3510
3511 Builder.restoreIP(*AfterIP);
3512 Builder.CreateStore(Reduced, LHSPtr);
3513 }
3514 }
3515
3516 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3517 for (auto En : enumerate(ReductionInfos)) {
3518 unsigned Index = En.index();
3519 const ReductionInfo &RI = En.value();
3520 Value *LHSFixupPtr, *RHSFixupPtr;
3521 Builder.restoreIP(RI.ReductionGenClang(
3522 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3523
3524 // Fix the callback code generated to use the correct Values for the
3525 // LHS and RHS
3526 LHSFixupPtr->replaceUsesWithIf(
3527 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3528 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3529 ReductionFunc;
3530 });
3531 RHSFixupPtr->replaceUsesWithIf(
3532 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3533 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3534 ReductionFunc;
3535 });
3536 }
3537
3538 Builder.CreateRetVoid();
3539 return ReductionFunc;
3540}
3541
3542static void
3543 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3544 bool IsGPU) {
3545 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3546 (void)RI;
3547 assert(RI.Variable && "expected non-null variable");
3548 assert(RI.PrivateVariable && "expected non-null private variable");
3549 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3550 "expected non-null reduction generator callback");
3551 if (!IsGPU) {
3552 assert(
3553 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3554 "expected variables and their private equivalents to have the same "
3555 "type");
3556 }
3557 assert(RI.Variable->getType()->isPointerTy() &&
3558 "expected variables to be pointers");
3559 }
3560}
3561
3562OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3563 const LocationDescription &Loc, InsertPointTy AllocaIP,
3564 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3565 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3566 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3567 Value *SrcLocInfo) {
3568 if (!updateToLocation(Loc))
3569 return InsertPointTy();
3570 Builder.restoreIP(CodeGenIP);
3571 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3572 LLVMContext &Ctx = M.getContext();
3573
3574 // Source location for the ident struct
3575 if (!SrcLocInfo) {
3576 uint32_t SrcLocStrSize;
3577 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3578 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3579 }
3580
3581 if (ReductionInfos.size() == 0)
3582 return Builder.saveIP();
3583
3584 BasicBlock *ContinuationBlock = nullptr;
3585 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3586 // Copied code from createReductions
3587 BasicBlock *InsertBlock = Loc.IP.getBlock();
3588 ContinuationBlock =
3589 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3590 InsertBlock->getTerminator()->eraseFromParent();
3591 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3592 }
3593
3594 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3595 AttributeList FuncAttrs;
3596 AttrBuilder AttrBldr(Ctx);
3597 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3598 AttrBldr.addAttribute(Attr);
3599 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3600 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3601
3602 CodeGenIP = Builder.saveIP();
3603 Expected<Function *> ReductionResult =
3604 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3605 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3606 if (!ReductionResult)
3607 return ReductionResult.takeError();
3608 Function *ReductionFunc = *ReductionResult;
3609 Builder.restoreIP(CodeGenIP);
3610
3611 // Set the grid value in the config needed for lowering later on
3612 if (GridValue.has_value())
3613 Config.setGridValue(GridValue.value());
3614 else
3615 Config.setGridValue(getGridValue(T, ReductionFunc));
3616
3617 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3618 // RedList, shuffle_reduce_func, interwarp_copy_func);
3619 // or
3620 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3621 Value *Res;
3622
3623 // 1. Build a list of reduction variables.
3624 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3625 auto Size = ReductionInfos.size();
3626 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
3627 Type *FuncPtrTy =
3628 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
3629 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3630 CodeGenIP = Builder.saveIP();
3631 Builder.restoreIP(AllocaIP);
3632 Value *ReductionListAlloca =
3633 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3634 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3635 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3636 Builder.restoreIP(CodeGenIP);
3637 Type *IndexTy = Builder.getIndexTy(
3638 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3639 for (auto En : enumerate(ReductionInfos)) {
3640 const ReductionInfo &RI = En.value();
3641 Value *ElemPtr = Builder.CreateInBoundsGEP(
3642 RedArrayTy, ReductionList,
3643 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3644 Value *CastElem =
3645 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3646 Builder.CreateStore(CastElem, ElemPtr);
3647 }
3648 CodeGenIP = Builder.saveIP();
3649 Function *SarFunc =
3650 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3651 Expected<Function *> CopyResult =
3652 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3653 if (!CopyResult)
3654 return CopyResult.takeError();
3655 Function *WcFunc = *CopyResult;
3656 Builder.restoreIP(CodeGenIP);
3657
3658 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3659
3660 unsigned MaxDataSize = 0;
3661 SmallVector<Type *> ReductionTypeArgs;
3662 for (auto En : enumerate(ReductionInfos)) {
3663 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3664 if (Size > MaxDataSize)
3665 MaxDataSize = Size;
3666 ReductionTypeArgs.emplace_back(En.value().ElementType);
3667 }
3668 Value *ReductionDataSize =
3669 Builder.getInt64(MaxDataSize * ReductionInfos.size());
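// Note that this size is conservative: every element is charged the size
// of the largest one, so the runtime buffer is always large enough.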
3670 if (!IsTeamsReduction) {
3671 Value *SarFuncCast =
3672 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy);
3673 Value *WcFuncCast =
3674 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
3675 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3676 WcFuncCast};
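// A sketch of the device runtime entry used here, inferred from the Args
// array above (not a canonical declaration):
//   int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
//       ident_t *loc, int64_t reduce_data_size, void *reduce_data,
//       void (*shuffle_reduce_fn)(void *, int16_t, int16_t, int16_t),
//       void (*inter_warp_copy_fn)(void *, int32_t));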
3677 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3678 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3679 Res = Builder.CreateCall(Pv2Ptr, Args);
3680 } else {
3681 CodeGenIP = Builder.saveIP();
3682 StructType *ReductionsBufferTy = StructType::create(
3683 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3684 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
3685 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3686 Function *LtGCFunc = emitListToGlobalCopyFunction(
3687 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3688 Function *LtGRFunc = emitListToGlobalReduceFunction(
3689 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3690 Function *GtLCFunc = emitGlobalToListCopyFunction(
3691 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3692 Function *GtLRFunc = emitGlobalToListReduceFunction(
3693 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3694 Builder.restoreIP(CodeGenIP);
3695
3696 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3697 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3698
3699 Value *Args3[] = {SrcLocInfo,
3700 KernelTeamsReductionPtr,
3701 Builder.getInt32(ReductionBufNum),
3702 ReductionDataSize,
3703 RL,
3704 SarFunc,
3705 WcFunc,
3706 LtGCFunc,
3707 LtGRFunc,
3708 GtLCFunc,
3709 GtLRFunc};
3710
3711 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3712 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3713 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3714 }
3715
3716 // 5. Build if (res == 1)
3717 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3718 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3719 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3720 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3721
3722 // 6. Build then branch: where we have reduced values in the master
3723 // thread in each team.
3724 // __kmpc_end_reduce{_nowait}(<gtid>);
3725 // break;
3726 emitBlock(ThenBB, CurFunc);
3727
3728 // Emit the final stores of the reduced values into the original variables.
3729 for (auto En : enumerate(ReductionInfos)) {
3730 const ReductionInfo &RI = En.value();
3731 Value *LHS = RI.Variable;
3732 Value *RHS =
3733 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3734
3735 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3736 Value *LHSPtr, *RHSPtr;
3737 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3738 &LHSPtr, &RHSPtr, CurFunc));
3739
3740 // Fix the callback code generated to use the correct Values for the
3741 // LHS and RHS
3742 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3743 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3744 ReductionFunc;
3745 });
3746 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3747 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3748 ReductionFunc;
3749 });
3750 } else {
3751 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3752 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3753 Value *Reduced;
3754 InsertPointOrErrorTy AfterIP =
3755 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3756 if (!AfterIP)
3757 return AfterIP.takeError();
3758 Builder.restoreIP(*AfterIP);
3759 Builder.CreateStore(Reduced, LHS, false);
3760 }
3761 }
3762 emitBlock(ExitBB, CurFunc);
3763 if (ContinuationBlock) {
3764 Builder.CreateBr(ContinuationBlock);
3765 Builder.SetInsertPoint(ContinuationBlock);
3766 }
3767 Config.setEmitLLVMUsed();
3768
3769 return Builder.saveIP();
3770}
3771
3772 static Function *getFreshReductionFunc(Module &M) {
3773 Type *VoidTy = Type::getVoidTy(M.getContext());
3774 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3775 auto *FuncTy =
3776 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3778 ".omp.reduction.func", &M);
3779}
3780
3781 static Error populateReductionFunction(
3782 Function *ReductionFunc,
3783 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3784 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3785 Module *Module = ReductionFunc->getParent();
3786 BasicBlock *ReductionFuncBlock =
3787 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3788 Builder.SetInsertPoint(ReductionFuncBlock);
3789 Value *LHSArrayPtr = nullptr;
3790 Value *RHSArrayPtr = nullptr;
3791 if (IsGPU) {
3792 // We need to allocate memory here and handle the address-space casts
3793 // before extracting the LHS/RHS pointers.
3794 //
3795 Argument *Arg0 = ReductionFunc->getArg(0);
3796 Argument *Arg1 = ReductionFunc->getArg(1);
3797 Type *Arg0Type = Arg0->getType();
3798 Type *Arg1Type = Arg1->getType();
3799
3800 Value *LHSAlloca =
3801 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3802 Value *RHSAlloca =
3803 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3804 Value *LHSAddrCast =
3805 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3806 Value *RHSAddrCast =
3807 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3808 Builder.CreateStore(Arg0, LHSAddrCast);
3809 Builder.CreateStore(Arg1, RHSAddrCast);
3810 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3811 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3812 } else {
3813 LHSArrayPtr = ReductionFunc->getArg(0);
3814 RHSArrayPtr = ReductionFunc->getArg(1);
3815 }
3816
3817 unsigned NumReductions = ReductionInfos.size();
3818 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3819
3820 for (auto En : enumerate(ReductionInfos)) {
3821 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3822 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3823 RedArrayTy, LHSArrayPtr, 0, En.index());
3824 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3825 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3826 LHSI8Ptr, RI.Variable->getType());
3827 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3828 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3829 RedArrayTy, RHSArrayPtr, 0, En.index());
3830 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3831 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3832 RHSI8Ptr, RI.PrivateVariable->getType());
3833 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3834 Value *Reduced;
3835 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3836 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3837 if (!AfterIP)
3838 return AfterIP.takeError();
3839
3840 Builder.restoreIP(*AfterIP);
3841 // TODO: Consider flagging an error.
3842 if (!Builder.GetInsertBlock())
3843 return Error::success();
3844
3845 // The store is inside the reduction region when using by-ref.
3846 if (!IsByRef[En.index()])
3847 Builder.CreateStore(Reduced, LHSPtr);
3848 }
3849 Builder.CreateRetVoid();
3850 return Error::success();
3851}
3852
3853OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3854 const LocationDescription &Loc, InsertPointTy AllocaIP,
3855 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3856 bool IsNoWait, bool IsTeamsReduction) {
3857 assert(ReductionInfos.size() == IsByRef.size());
3858 if (Config.isGPU())
3859 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3860 IsNoWait, IsTeamsReduction);
3861
3862 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3863
3864 if (!updateToLocation(Loc))
3865 return InsertPointTy();
3866
3867 if (ReductionInfos.size() == 0)
3868 return Builder.saveIP();
3869
3870 BasicBlock *InsertBlock = Loc.IP.getBlock();
3871 BasicBlock *ContinuationBlock =
3872 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3873 InsertBlock->getTerminator()->eraseFromParent();
3874
3875 // Create and populate array of type-erased pointers to private reduction
3876 // values.
3877 unsigned NumReductions = ReductionInfos.size();
3878 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3879 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3880 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3881
3882 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3883
3884 for (auto En : enumerate(ReductionInfos)) {
3885 unsigned Index = En.index();
3886 const ReductionInfo &RI = En.value();
3887 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3888 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3889 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3890 }
3891
3892 // Emit a call to the runtime function that orchestrates the reduction.
3893 // Declare the reduction function in the process.
3894 Type *IndexTy = Builder.getIndexTy(
3895 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3896 Function *Func = Builder.GetInsertBlock()->getParent();
3897 Module *Module = Func->getParent();
3898 uint32_t SrcLocStrSize;
3899 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3900 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3901 return RI.AtomicReductionGen;
3902 });
3903 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3904 CanGenerateAtomic
3905 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3906 : IdentFlag(0));
3907 Value *ThreadId = getOrCreateThreadID(Ident);
3908 Constant *NumVariables = Builder.getInt32(NumReductions);
3909 const DataLayout &DL = Module->getDataLayout();
3910 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3911 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3912 Function *ReductionFunc = getFreshReductionFunc(*Module);
3913 Value *Lock = getOMPCriticalRegionLock(".reduction");
3914 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3915 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3916 : RuntimeFunction::OMPRTL___kmpc_reduce);
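// For reference, the host runtime entry has this shape (a sketch based on
// the operands passed below):
//   int32_t __kmpc_reduce[_nowait](ident_t *loc, int32_t gtid,
//       int32_t num_vars, size_t reduce_size, void *reduce_data,
//       void (*reduce_func)(void *lhs, void *rhs),
//       kmp_critical_name *lck);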
3917 CallInst *ReduceCall =
3918 Builder.CreateCall(ReduceFunc,
3919 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3920 ReductionFunc, Lock},
3921 "reduce");
3922
3923 // Create final reduction entry blocks for the atomic and non-atomic case.
3924 // Emit IR that dispatches control flow to one of the blocks based on the
3925 // reduction supporting the atomic mode.
3926 BasicBlock *NonAtomicRedBlock =
3927 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3928 BasicBlock *AtomicRedBlock =
3929 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3930 SwitchInst *Switch =
3931 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3932 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3933 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
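// The runtime returns 1 when the calling thread should perform the
// non-atomic reduction, 2 when it should take the atomic path, and 0 when
// there is nothing left to do; the default switch edge covers that case.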
3934
3935 // Populate the non-atomic reduction using the elementwise reduction function.
3936 // This loads the elements from the global and private variables and reduces
3937 // them before storing back the result to the global variable.
3938 Builder.SetInsertPoint(NonAtomicRedBlock);
3939 for (auto En : enumerate(ReductionInfos)) {
3940 const ReductionInfo &RI = En.value();
3941 Type *ValueType = RI.ElementType;
3942 // We have one less load for the by-ref case because that load is now
3943 // inside the reduction region.
3944 Value *RedValue = RI.Variable;
3945 if (!IsByRef[En.index()]) {
3946 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3947 "red.value." + Twine(En.index()));
3948 }
3949 Value *PrivateRedValue =
3950 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3951 "red.private.value." + Twine(En.index()));
3952 Value *Reduced;
3953 InsertPointOrErrorTy AfterIP =
3954 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3955 if (!AfterIP)
3956 return AfterIP.takeError();
3957 Builder.restoreIP(*AfterIP);
3958
3959 if (!Builder.GetInsertBlock())
3960 return InsertPointTy();
3961 // For the by-ref case, the load is inside the reduction region.
3962 if (!IsByRef[En.index()])
3963 Builder.CreateStore(Reduced, RI.Variable);
3964 }
3965 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3966 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3967 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3968 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3969 Builder.CreateBr(ContinuationBlock);
3970
3971 // Populate the atomic reduction using the atomic elementwise reduction
3972 // function. There are no loads/stores here because they happen inside
3973 // the atomic elementwise reduction.
3974 Builder.SetInsertPoint(AtomicRedBlock);
3975 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3976 for (const ReductionInfo &RI : ReductionInfos) {
3977 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3978 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3979 if (!AfterIP)
3980 return AfterIP.takeError();
3981 Builder.restoreIP(*AfterIP);
3982 if (!Builder.GetInsertBlock())
3983 return InsertPointTy();
3984 }
3985 Builder.CreateBr(ContinuationBlock);
3986 } else {
3987 Builder.CreateUnreachable();
3988 }
3989
3990 // Populate the outlined reduction function using the elementwise reduction
3991 // function. Partial values are extracted from the type-erased array of
3992 // pointers to private variables.
3993 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3994 IsByRef, /*isGPU=*/false);
3995 if (Err)
3996 return Err;
3997
3998 if (!Builder.GetInsertBlock())
3999 return InsertPointTy();
4000
4001 Builder.SetInsertPoint(ContinuationBlock);
4002 return Builder.saveIP();
4003}
4004
4005OpenMPIRBuilder::InsertPointOrErrorTy
4006OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4007 BodyGenCallbackTy BodyGenCB,
4008 FinalizeCallbackTy FiniCB) {
4009 if (!updateToLocation(Loc))
4010 return Loc.IP;
4011
4012 Directive OMPD = Directive::OMPD_master;
4013 uint32_t SrcLocStrSize;
4014 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4015 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4016 Value *ThreadId = getOrCreateThreadID(Ident);
4017 Value *Args[] = {Ident, ThreadId};
4018
4019 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4020 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4021
4022 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4023 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4024
4025 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4026 /*Conditional*/ true, /*hasFinalize*/ true);
4027}
4028
4029OpenMPIRBuilder::InsertPointOrErrorTy
4030OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4031 BodyGenCallbackTy BodyGenCB,
4032 FinalizeCallbackTy FiniCB, Value *Filter) {
4033 if (!updateToLocation(Loc))
4034 return Loc.IP;
4035
4036 Directive OMPD = Directive::OMPD_masked;
4037 uint32_t SrcLocStrSize;
4038 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4039 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4040 Value *ThreadId = getOrCreateThreadID(Ident);
4041 Value *Args[] = {Ident, ThreadId, Filter};
4042 Value *ArgsEnd[] = {Ident, ThreadId};
4043
4044 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4045 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4046
4047 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4048 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4049
4050 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4051 /*Conditional*/ true, /*hasFinalize*/ true);
4052}
4053
4054 static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
4055 llvm::FunctionCallee Callee,
4056 ArrayRef<llvm::Value *> Args,
4057 const llvm::Twine &Name) {
4058 llvm::CallInst *Call = Builder.CreateCall(
4059 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4060 Call->setDoesNotThrow();
4061 return Call;
4062}
4063
4064 // Expects the input basic block to be dominated by BeforeScanBB. Once the
4065 // scan directive is encountered, the code after it should be dominated by
4066 // AfterScanBB. The scan directive splits the code sequence into an input
4067 // phase and a scan phase. Based on whether the inclusive or exclusive
4068 // clause is used and whether the input loop or the scan loop is being
4069 // lowered, it adds jumps to the input and scan phases. The first scan loop
4070 // is the input loop and the second is the scan loop. The generated code
4071 // currently handles only inclusive scans.
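// For example, for an inclusive scan such as
//   #pragma omp simd reduction(inscan, +:red)
//   for (i = 0; i < n; ++i) {
//     red += x[i];            // input phase
//     #pragma omp scan inclusive(red)
//     y[i] = red;             // scan phase
//   }
// the lowering runs the loop twice: the first (input) loop executes only
// the input phase and stores red into buffer[i]; after emitScanReduction
// turns the buffer into prefix reductions, the second (scan) loop loads
// red = buffer[i] and executes only the scan phase. (Illustrative sketch,
// not generated code.)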
4072OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4073 const LocationDescription &Loc, InsertPointTy AllocaIP,
4074 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4075 bool IsInclusive, ScanInfo *ScanRedInfo) {
4076 if (ScanRedInfo->OMPFirstScanLoop) {
4077 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4078 ScanVarsType, ScanRedInfo);
4079 if (Err)
4080 return Err;
4081 }
4082 if (!updateToLocation(Loc))
4083 return Loc.IP;
4084
4085 llvm::Value *IV = ScanRedInfo->IV;
4086
4087 if (ScanRedInfo->OMPFirstScanLoop) {
4088 // Emit buffer[i] = red; at the end of the input phase.
4089 for (size_t i = 0; i < ScanVars.size(); i++) {
4090 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4091 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4092 Type *DestTy = ScanVarsType[i];
4093 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4094 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4095
4096 Builder.CreateStore(Src, Val);
4097 }
4098 }
4099 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4100 emitBlock(ScanRedInfo->OMPScanDispatch,
4101 Builder.GetInsertBlock()->getParent());
4102
4103 if (!ScanRedInfo->OMPFirstScanLoop) {
4104 IV = ScanRedInfo->IV;
4105 // Emit red = buffer[i]; at the entrance to the scan phase.
4106 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4107 for (size_t i = 0; i < ScanVars.size(); i++) {
4108 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4109 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4110 Type *DestTy = ScanVarsType[i];
4111 Value *SrcPtr =
4112 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4113 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4114 Builder.CreateStore(Src, ScanVars[i]);
4115 }
4116 }
4117
4118 // TODO: Update it to CreateBr and remove dead blocks
4119 llvm::Value *CmpI = Builder.getInt1(true);
4120 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4121 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4122 ScanRedInfo->OMPAfterScanBlock);
4123 } else {
4124 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4125 ScanRedInfo->OMPBeforeScanBlock);
4126 }
4127 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4128 Builder.GetInsertBlock()->getParent());
4129 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4130 return Builder.saveIP();
4131}
4132
4133Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4134 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4135 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4136
4137 Builder.restoreIP(AllocaIP);
4138 // Create the shared pointer at alloca IP.
4139 for (size_t i = 0; i < ScanVars.size(); i++) {
4140 llvm::Value *BuffPtr =
4141 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4142 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4143 }
4144
4145 // Allocate the temporary buffers on the master thread.
4146 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4147 InsertPointTy CodeGenIP) -> Error {
4148 Builder.restoreIP(CodeGenIP);
4149 Value *AllocSpan =
4150 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4151 for (size_t i = 0; i < ScanVars.size(); i++) {
4152 Type *IntPtrTy = Builder.getInt32Ty();
4153 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4154 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4155 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4156 AllocSpan, nullptr, "arr");
4157 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4158 }
4159 return Error::success();
4160 };
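// In effect the master thread executes (a sketch):
//   for each scan variable v:
//     *ScanBuffPtrs[v] = malloc((span + 1) * sizeof(<element type of v>));
// The masked construct below restricts the allocation to thread 0, and the
// barrier emitted afterwards makes the buffers visible to all threads.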
4161 // TODO: Perform finalization actions for variables. This has to be
4162 // called for variables which have destructors/finalizers.
4163 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4164
4165 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4166 llvm::Value *FilterVal = Builder.getInt32(0);
4167 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4168 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4169
4170 if (!AfterIP)
4171 return AfterIP.takeError();
4172 Builder.restoreIP(*AfterIP);
4173 BasicBlock *InputBB = Builder.GetInsertBlock();
4174 if (InputBB->getTerminator())
4175 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4176 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4177 if (!AfterIP)
4178 return AfterIP.takeError();
4179 Builder.restoreIP(*AfterIP);
4180
4181 return Error::success();
4182}
4183
4184Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4185 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4186 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4187 InsertPointTy CodeGenIP) -> Error {
4188 Builder.restoreIP(CodeGenIP);
4189 for (ReductionInfo RedInfo : ReductionInfos) {
4190 Value *PrivateVar = RedInfo.PrivateVariable;
4191 Value *OrigVar = RedInfo.Variable;
4192 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4193 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4194
4195 Type *SrcTy = RedInfo.ElementType;
4196 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4197 "arrayOffset");
4198 Value *Src = Builder.CreateLoad(SrcTy, Val);
4199
4200 Builder.CreateStore(Src, OrigVar);
4201 Builder.CreateFree(Buff);
4202 }
4203 return Error::success();
4204 };
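// After the prefix computation, buffer[span] holds the completed reduction
// value, so the callback above copies it into the original variable and
// frees the buffer; the masked construct below restricts this to thread 0.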
4205 // TODO: Perform finalization actions for variables. This has to be
4206 // called for variables which have destructors/finalizers.
4207 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4208
4209 if (ScanRedInfo->OMPScanFinish->getTerminator())
4210 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4211 else
4212 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4213
4214 llvm::Value *FilterVal = Builder.getInt32(0);
4215 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4216 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4217
4218 if (!AfterIP)
4219 return AfterIP.takeError();
4220 Builder.restoreIP(*AfterIP);
4221 BasicBlock *InputBB = Builder.GetInsertBlock();
4222 if (InputBB->getTerminator())
4223 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4224 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4225 if (!AfterIP)
4226 return AfterIP.takeError();
4227 Builder.restoreIP(*AfterIP);
4228 return Error::success();
4229}
4230
4231OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4232 const LocationDescription &Loc,
4233 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4234 ScanInfo *ScanRedInfo) {
4235
4236 if (!updateToLocation(Loc))
4237 return Loc.IP;
4238 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4239 InsertPointTy CodeGenIP) -> Error {
4240 Builder.restoreIP(CodeGenIP);
4241 Function *CurFn = Builder.GetInsertBlock()->getParent();
4242 // for (int k = 0; k <= ceil(log2(n)); ++k)
4243 llvm::BasicBlock *LoopBB =
4244 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4245 llvm::BasicBlock *ExitBB =
4246 splitBB(Builder, false, "omp.outer.log.scan.exit");
4247 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4248 Builder.GetInsertBlock()->getModule(),
4249 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4250 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4251 llvm::Value *Arg =
4252 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4253 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4254 F = llvm::Intrinsic::getOrInsertDeclaration(
4255 Builder.GetInsertBlock()->getModule(),
4256 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4257 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4258 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4259 llvm::Value *NMin1 = Builder.CreateNUWSub(
4260 ScanRedInfo->Span,
4261 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4262 Builder.SetInsertPoint(InputBB);
4263 Builder.CreateBr(LoopBB);
4264 emitBlock(LoopBB, CurFn);
4265 Builder.SetInsertPoint(LoopBB);
4266
4267 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4268 // size pow2k = 1;
4269 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4270 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4271 InputBB);
4272 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4273 InputBB);
4274 // for (size i = n - 1; i >= 2 ^ k; --i)
4275 // tmp[i] op= tmp[i-pow2k];
4276 llvm::BasicBlock *InnerLoopBB =
4277 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4278 llvm::BasicBlock *InnerExitBB =
4279 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4280 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4281 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4282 emitBlock(InnerLoopBB, CurFn);
4283 Builder.SetInsertPoint(InnerLoopBB);
4284 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4285 IVal->addIncoming(NMin1, LoopBB);
4286 for (ReductionInfo RedInfo : ReductionInfos) {
4287 Value *ReductionVal = RedInfo.PrivateVariable;
4288 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4289 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4290 Type *DestTy = RedInfo.ElementType;
4291 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4292 Value *LHSPtr =
4293 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4294 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4295 Value *RHSPtr =
4296 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4297 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4298 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4299 llvm::Value *Result;
4300 InsertPointOrErrorTy AfterIP =
4301 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4302 if (!AfterIP)
4303 return AfterIP.takeError();
4304 Builder.CreateStore(Result, LHSPtr);
4305 }
4306 llvm::Value *NextIVal = Builder.CreateNUWSub(
4307 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4308 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4309 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4310 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4311 emitBlock(InnerExitBB, CurFn);
4312 llvm::Value *Next = Builder.CreateNUWAdd(
4313 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4314 Counter->addIncoming(Next, Builder.GetInsertBlock());
4315 // pow2k <<= 1;
4316 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4317 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4318 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4319 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4320 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4321 return Error::success();
4322 };
4323
4324 // TODO: Perform finalization actions for variables. This has to be
4325 // called for variables which have destructors/finalizers.
4326 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4327
4328 llvm::Value *FilterVal = Builder.getInt32(0);
4329 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4330 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4331
4332 if (!AfterIP)
4333 return AfterIP.takeError();
4334 Builder.restoreIP(*AfterIP);
4335 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4336
4337 if (!AfterIP)
4338 return AfterIP.takeError();
4339 Builder.restoreIP(*AfterIP);
4340 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4341 if (Err)
4342 return Err;
4343
4344 return AfterIP;
4345}
4346
4347Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4348 llvm::function_ref<Error()> InputLoopGen,
4349 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4350 ScanInfo *ScanRedInfo) {
4351
4352 {
4353 // Emit loop with input phase:
4354 // for (i: 0..<num_iters>) {
4355 // <input phase>;
4356 // buffer[i] = red;
4357 // }
4358 ScanRedInfo->OMPFirstScanLoop = true;
4359 Error Err = InputLoopGen();
4360 if (Err)
4361 return Err;
4362 }
4363 {
4364 // Emit loop with scan phase:
4365 // for (i: 0..<num_iters>) {
4366 // red = buffer[i];
4367 // <scan phase>;
4368 // }
4369 ScanRedInfo->OMPFirstScanLoop = false;
4370 Error Err = ScanLoopGen(Builder.saveIP());
4371 if (Err)
4372 return Err;
4373 }
4374 return Error::success();
4375}
4376
4377void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4378 Function *Fun = Builder.GetInsertBlock()->getParent();
4379 ScanRedInfo->OMPScanDispatch =
4380 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4381 ScanRedInfo->OMPAfterScanBlock =
4382 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4383 ScanRedInfo->OMPBeforeScanBlock =
4384 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4385 ScanRedInfo->OMPScanLoopExit =
4386 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4387}
4388CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4389 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4390 BasicBlock *PostInsertBefore, const Twine &Name) {
4391 Module *M = F->getParent();
4392 LLVMContext &Ctx = M->getContext();
4393 Type *IndVarTy = TripCount->getType();
4394
4395 // Create the basic block structure.
4396 BasicBlock *Preheader =
4397 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4398 BasicBlock *Header =
4399 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4400 BasicBlock *Cond =
4401 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4402 BasicBlock *Body =
4403 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4404 BasicBlock *Latch =
4405 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4406 BasicBlock *Exit =
4407 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4408 BasicBlock *After =
4409 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4410
4411 // Use specified DebugLoc for new instructions.
4412 Builder.SetCurrentDebugLocation(DL);
4413
4414 Builder.SetInsertPoint(Preheader);
4415 Builder.CreateBr(Header);
4416
4417 Builder.SetInsertPoint(Header);
4418 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4419 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4420 Builder.CreateBr(Cond);
4421
4422 Builder.SetInsertPoint(Cond);
4423 Value *Cmp =
4424 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4425 Builder.CreateCondBr(Cmp, Body, Exit);
4426
4427 Builder.SetInsertPoint(Body);
4428 Builder.CreateBr(Latch);
4429
4430 Builder.SetInsertPoint(Latch);
4431 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4432 "omp_" + Name + ".next", /*HasNUW=*/true);
4433 Builder.CreateBr(Header);
4434 IndVarPHI->addIncoming(Next, Latch);
4435
4436 Builder.SetInsertPoint(Exit);
4437 Builder.CreateBr(After);
4438
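// For reference, the branches created above form the following CFG:
//
//   preheader -> header -> cond --true--> body -> latch -> header (back)
//                                |
//                                +--false--> exit -> after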
4439 // Remember and return the canonical control flow.
4440 LoopInfos.emplace_front();
4441 CanonicalLoopInfo *CL = &LoopInfos.front();
4442
4443 CL->Header = Header;
4444 CL->Cond = Cond;
4445 CL->Latch = Latch;
4446 CL->Exit = Exit;
4447
4448#ifndef NDEBUG
4449 CL->assertOK();
4450#endif
4451 return CL;
4452}
4453
4454Expected<CanonicalLoopInfo *>
4455OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4456 LoopBodyGenCallbackTy BodyGenCB,
4457 Value *TripCount, const Twine &Name) {
4458 BasicBlock *BB = Loc.IP.getBlock();
4459 BasicBlock *NextBB = BB->getNextNode();
4460
4461 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4462 NextBB, NextBB, Name);
4463 BasicBlock *After = CL->getAfter();
4464
4465 // If location is not set, don't connect the loop.
4466 if (updateToLocation(Loc)) {
4467 // Split the loop at the insertion point: Branch to the preheader and move
4468 // every following instruction to after the loop (the After BB). Also, the
4469 // new successor is the loop's after block.
4470 spliceBB(Builder, After, /*CreateBranch=*/false);
4471 Builder.CreateBr(CL->getPreheader());
4472 }
4473
4474 // Emit the body content. We do it after connecting the loop to the CFG so
4475 // that the callback does not encounter degenerate BBs.
4476 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4477 return Err;
4478
4479#ifndef NDEBUG
4480 CL->assertOK();
4481#endif
4482 return CL;
4483}
4484
4485Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4486 ScanInfos.emplace_front();
4487 ScanInfo *Result = &ScanInfos.front();
4488 return Result;
4489}
4490
4491Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4492OpenMPIRBuilder::createCanonicalScanLoops(
4493 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4494 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4495 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4496 LocationDescription ComputeLoc =
4497 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4498 updateToLocation(ComputeLoc);
4499
4500 SmallVector<llvm::CanonicalLoopInfo *> Result;
4501
4502 Value *TripCount = calculateCanonicalLoopTripCount(
4503 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4504 ScanRedInfo->Span = TripCount;
4505 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4506 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4507
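// The BodyGen callback below redirects each generated loop body into the
// dispatch block created by createScanBBs; the scan directive's codegen
// later wires the dispatch block to either the before-scan or the
// after-scan block, depending on whether the input-phase or the scan-phase
// loop is being emitted.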
4508 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4509 Builder.restoreIP(CodeGenIP);
4510 ScanRedInfo->IV = IV;
4511 createScanBBs(ScanRedInfo);
4512 BasicBlock *InputBlock = Builder.GetInsertBlock();
4513 Instruction *Terminator = InputBlock->getTerminator();
4514 assert(Terminator->getNumSuccessors() == 1);
4515 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4516 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4517 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4518 Builder.GetInsertBlock()->getParent());
4519 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4520 emitBlock(ScanRedInfo->OMPScanLoopExit,
4521 Builder.GetInsertBlock()->getParent());
4522 Builder.CreateBr(ContinueBlock);
4523 Builder.SetInsertPoint(
4524 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4525 return BodyGenCB(Builder.saveIP(), IV);
4526 };
4527
4528 const auto &&InputLoopGen = [&]() -> Error {
4529 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4530 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4531 ComputeIP, Name, true, ScanRedInfo);
4532 if (!LoopInfo)
4533 return LoopInfo.takeError();
4534 Result.push_back(*LoopInfo);
4535 Builder.restoreIP((*LoopInfo)->getAfterIP());
4536 return Error::success();
4537 };
4538 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4539 Expected<CanonicalLoopInfo *> LoopInfo =
4540 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4541 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4542 if (!LoopInfo)
4543 return LoopInfo.takeError();
4544 Result.push_back(*LoopInfo);
4545 Builder.restoreIP((*LoopInfo)->getAfterIP());
4546 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4547 return Error::success();
4548 };
4549 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4550 if (Err)
4551 return Err;
4552 return Result;
4553}
4554
4555Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4556 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4557 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4558
4559 // Consider the following difficulties (assuming 8-bit signed integers):
4560 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4561 // DO I = 1, 100, 50
4562 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4563 // DO I = 100, 0, -128
4564
4565 // Start, Stop and Step must be of the same integer type.
4566 auto *IndVarTy = cast<IntegerType>(Start->getType());
4567 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4568 assert(IndVarTy == Step->getType() && "Step type mismatch");
4569
4570 updateToLocation(Loc);
4571
4572 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4573 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4574
4575 // Like Step, but always positive.
4576 Value *Incr = Step;
4577
4578 // Distance between Start and Stop; always positive.
4579 Value *Span;
4580
4581 // Condition for whether no iterations are executed at all, e.g. because
4582 // UB < LB.
4583 Value *ZeroCmp;
4584
4585 if (IsSigned) {
4586 // Ensure that increment is positive. If not, negate and invert LB and UB.
4587 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4588 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4589 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4590 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4591 Span = Builder.CreateSub(UB, LB, "", false, true);
4592 ZeroCmp = Builder.CreateICmp(
4593 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4594 } else {
4595 Span = Builder.CreateSub(Stop, Start, "", true);
4596 ZeroCmp = Builder.CreateICmp(
4597 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4598 }
4599
4600 Value *CountIfLooping;
4601 if (InclusiveStop) {
4602 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4603 } else {
4604 // Avoid incrementing past stop since it could overflow.
4605 Value *CountIfTwo = Builder.CreateAdd(
4606 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4607 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4608 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4609 }
4610
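// Worked example (unsigned, InclusiveStop=false): Start=0, Stop=10, Step=3
// gives Span=10; Span is not <= Incr, so CountIfLooping=(10-1)/3+1=4,
// matching the iterations 0, 3, 6, and 9. The ZeroCmp select yields 0 only
// when the loop would not execute at all, e.g. when Stop <= Start.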
4611 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4612 "omp_" + Name + ".tripcount");
4613}
4614
4615Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4616 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4617 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4618 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4619 ScanInfo *ScanRedInfo) {
4620 LocationDescription ComputeLoc =
4621 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4622
4623 Value *TripCount = calculateCanonicalLoopTripCount(
4624 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4625
4626 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4627 Builder.restoreIP(CodeGenIP);
4628 Value *Span = Builder.CreateMul(IV, Step);
4629 Value *IndVar = Builder.CreateAdd(Span, Start);
4630 if (InScan)
4631 ScanRedInfo->IV = IndVar;
4632 return BodyGenCB(Builder.saveIP(), IndVar);
4633 };
4634 LocationDescription LoopLoc =
4635 ComputeIP.isSet()
4636 ? Loc
4637 : LocationDescription(Builder.saveIP(),
4638 Builder.getCurrentDebugLocation());
4639 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4640}
4641
4642// Returns an LLVM function to call for initializing loop bounds using OpenMP
4643// static scheduling for composite `distribute parallel for` depending on
4644// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4645// integers as unsigned similarly to CanonicalLoopInfo.
4646static FunctionCallee
4647getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4648 OpenMPIRBuilder &OMPBuilder) {
4649 unsigned Bitwidth = Ty->getIntegerBitWidth();
4650 if (Bitwidth == 32)
4651 return OMPBuilder.getOrCreateRuntimeFunction(
4652 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4653 if (Bitwidth == 64)
4654 return OMPBuilder.getOrCreateRuntimeFunction(
4655 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4656 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4657}
4658
4659// Returns an LLVM function to call for initializing loop bounds using OpenMP
4660// static scheduling depending on `type`. Only i32 and i64 are supported by the
4661// runtime. Always interpret integers as unsigned similarly to
4662// CanonicalLoopInfo.
4663static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4664 OpenMPIRBuilder &OMPBuilder) {
4665 unsigned Bitwidth = Ty->getIntegerBitWidth();
4666 if (Bitwidth == 32)
4667 return OMPBuilder.getOrCreateRuntimeFunction(
4668 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4669 if (Bitwidth == 64)
4670 return OMPBuilder.getOrCreateRuntimeFunction(
4671 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4672 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4673}
4674
4675OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4676 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4677 WorksharingLoopType LoopType, bool NeedsBarrier) {
4678 assert(CLI->isValid() && "Requires a valid canonical loop");
4679 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4680 "Require dedicated allocate IP");
4681
4682 // Set up the source location value for OpenMP runtime.
4683 Builder.restoreIP(CLI->getPreheaderIP());
4684 Builder.SetCurrentDebugLocation(DL);
4685
4686 uint32_t SrcLocStrSize;
4687 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4688 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4689
4690 // Declare useful OpenMP runtime functions.
4691 Value *IV = CLI->getIndVar();
4692 Type *IVTy = IV->getType();
4693 FunctionCallee StaticInit =
4694 LoopType == WorksharingLoopType::DistributeForStaticLoop
4695 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4696 : getKmpcForStaticInitForType(IVTy, M, *this);
4697 FunctionCallee StaticFini =
4698 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4699
4700 // Allocate space for computed loop bounds as expected by the "init" function.
4701 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4702
4703 Type *I32Type = Type::getInt32Ty(M.getContext());
4704 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4705 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4706 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4707 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4708 CLI->setLastIter(PLastIter);
4709
4710 // At the end of the preheader, prepare for calling the "init" function by
4711 // storing the current loop bounds into the allocated space. A canonical loop
4712 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4713 // and produces an inclusive upper bound.
4714 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4715 Constant *Zero = ConstantInt::get(IVTy, 0);
4716 Constant *One = ConstantInt::get(IVTy, 1);
4717 Builder.CreateStore(Zero, PLowerBound);
4718 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4719 Builder.CreateStore(UpperBound, PUpperBound);
4720 Builder.CreateStore(One, PStride);
4721
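// The "init" runtime call treats the bound pointers as in/out arguments: on
// entry they hold the global inclusive bounds stored above, and on return
// the inclusive bounds of the chunk assigned to the calling thread, which
// is why the trip count is recomputed from them below.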
4722 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4723
4724 OMPScheduleType SchedType =
4725 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4726 ? OMPScheduleType::OrderedDistribute
4727 : OMPScheduleType::UnorderedStatic;
4728 Constant *SchedulingType =
4729 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4730
4731 // Call the "init" function and update the trip count of the loop with the
4732 // value it produced.
4733 SmallVector<Value *, 10> Args(
4734 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4735 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4736 Value *PDistUpperBound =
4737 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4738 Args.push_back(PDistUpperBound);
4739 }
4740 Args.append({PStride, One, Zero});
4741 Builder.CreateCall(StaticInit, Args);
4742 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4743 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4744 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4745 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4746 CLI->setTripCount(TripCount);
4747
4748 // Update all uses of the induction variable except the one in the condition
4749 // block that compares it with the actual upper bound, and the increment in
4750 // the latch block.
4751
4752 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4753 Builder.SetInsertPoint(CLI->getBody(),
4754 CLI->getBody()->getFirstInsertionPt());
4755 Builder.SetCurrentDebugLocation(DL);
4756 return Builder.CreateAdd(OldIV, LowerBound);
4757 });
4758
4759 // In the "exit" block, call the "fini" function.
4760 Builder.SetInsertPoint(CLI->getExit(),
4761 CLI->getExit()->getTerminator()->getIterator());
4762 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4763
4764 // Add the barrier if requested.
4765 if (NeedsBarrier) {
4766 InsertPointOrErrorTy BarrierIP =
4767 createBarrier(LocationDescription(Builder.saveIP(), DL),
4768 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4769 /* CheckCancelFlag */ false);
4770 if (!BarrierIP)
4771 return BarrierIP.takeError();
4772 }
4773
4774 InsertPointTy AfterIP = CLI->getAfterIP();
4775 CLI->invalidate();
4776
4777 return AfterIP;
4778}
4779
4780OpenMPIRBuilder::InsertPointOrErrorTy
4781OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4782 CanonicalLoopInfo *CLI,
4783 InsertPointTy AllocaIP,
4784 bool NeedsBarrier,
4785 Value *ChunkSize) {
4786 assert(CLI->isValid() && "Requires a valid canonical loop");
4787 assert(ChunkSize && "Chunk size is required");
4788
4789 LLVMContext &Ctx = CLI->getFunction()->getContext();
4790 Value *IV = CLI->getIndVar();
4791 Value *OrigTripCount = CLI->getTripCount();
4792 Type *IVTy = IV->getType();
4793 assert(IVTy->getIntegerBitWidth() <= 64 &&
4794 "Max supported tripcount bitwidth is 64 bits");
4795 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4796 : Type::getInt64Ty(Ctx);
4797 Type *I32Type = Type::getInt32Ty(M.getContext());
4798 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4799 Constant *One = ConstantInt::get(InternalIVTy, 1);
4800
4801 // Declare useful OpenMP runtime functions.
4802 FunctionCallee StaticInit =
4803 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4804 FunctionCallee StaticFini =
4805 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4806
4807 // Allocate space for computed loop bounds as expected by the "init" function.
4808 Builder.restoreIP(AllocaIP);
4809 Builder.SetCurrentDebugLocation(DL);
4810 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4811 Value *PLowerBound =
4812 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4813 Value *PUpperBound =
4814 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4815 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4816 CLI->setLastIter(PLastIter);
4817
4818 // Set up the source location value for the OpenMP runtime.
4819 Builder.restoreIP(CLI->getPreheaderIP());
4820 Builder.SetCurrentDebugLocation(DL);
4821
4822 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4823 Value *CastedChunkSize =
4824 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4825 Value *CastedTripCount =
4826 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4827
4828 Constant *SchedulingType = ConstantInt::get(
4829 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4830 Builder.CreateStore(Zero, PLowerBound);
4831 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4832 Builder.CreateStore(OrigUpperBound, PUpperBound);
4833 Builder.CreateStore(One, PStride);
4834
4835 // Call the "init" function and update the trip count of the loop with the
4836 // value it produced.
4837 uint32_t SrcLocStrSize;
4838 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4839 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4840 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4841 Builder.CreateCall(StaticInit,
4842 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4843 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4844 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4845 /*pstride=*/PStride, /*incr=*/One,
4846 /*chunk=*/CastedChunkSize});
4847
4848 // Load values written by the "init" function.
4849 Value *FirstChunkStart =
4850 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4851 Value *FirstChunkStop =
4852 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4853 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4854 Value *ChunkRange =
4855 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4856 Value *NextChunkStride =
4857 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4858
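// The structure assembled below is, in pseudocode:
//
//   for (dispatch = firstchunk.lb; dispatch < tripcount; dispatch += stride)
//     for (iv = 0; iv < chunk.tripcount; ++iv)
//       body(dispatch + iv);
//
// where chunk.tripcount equals the chunk range except for the final,
// possibly partial, chunk.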
4859 // Create outer "dispatch" loop for enumerating the chunks.
4860 BasicBlock *DispatchEnter = splitBB(Builder, true);
4861 Value *DispatchCounter;
4862
4863 // It is safe to assume this didn't return an error because the callback
4864 // passed into createCanonicalLoop is the only possible error source, and it
4865 // always returns success.
4866 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4867 {Builder.saveIP(), DL},
4868 [&](InsertPointTy BodyIP, Value *Counter) {
4869 DispatchCounter = Counter;
4870 return Error::success();
4871 },
4872 FirstChunkStart, CastedTripCount, NextChunkStride,
4873 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4874 "dispatch"));
4875
4876 // Remember the BasicBlocks of the dispatch loop we need, then invalidate
4877 // the CLI so we do not have to preserve the canonical invariant.
4878 BasicBlock *DispatchBody = DispatchCLI->getBody();
4879 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4880 BasicBlock *DispatchExit = DispatchCLI->getExit();
4881 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4882 DispatchCLI->invalidate();
4883
4884 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4885 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4886 redirectTo(CLI->getExit(), DispatchLatch, DL);
4887 redirectTo(DispatchBody, DispatchEnter, DL);
4888
4889 // Prepare the prolog of the chunk loop.
4890 Builder.restoreIP(CLI->getPreheaderIP());
4891 Builder.SetCurrentDebugLocation(DL);
4892
4893 // Compute the number of iterations of the chunk loop.
4894 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4895 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4896 Value *IsLastChunk =
4897 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4898 Value *CountUntilOrigTripCount =
4899 Builder.CreateSub(CastedTripCount, DispatchCounter);
4900 Value *ChunkTripCount = Builder.CreateSelect(
4901 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4902 Value *BackcastedChunkTC =
4903 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4904 CLI->setTripCount(BackcastedChunkTC);
4905
4906 // Update all uses of the induction variable except the one in the condition
4907 // block that compares it with the actual upper bound, and the increment in
4908 // the latch block.
4909 Value *BackcastedDispatchCounter =
4910 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4911 CLI->mapIndVar([&](Instruction *) -> Value * {
4912 Builder.restoreIP(CLI->getBodyIP());
4913 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4914 });
4915
4916 // In the "exit" block, call the "fini" function.
4917 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4918 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4919
4920 // Add the barrier if requested.
4921 if (NeedsBarrier) {
4922 InsertPointOrErrorTy AfterIP =
4923 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4924 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4925 if (!AfterIP)
4926 return AfterIP.takeError();
4927 }
4928
4929#ifndef NDEBUG
4930 // Even though we currently do not support applying additional methods to it,
4931 // the chunk loop should remain a canonical loop.
4932 CLI->assertOK();
4933#endif
4934
4935 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4936}
4937
4938// Returns an LLVM function to call for executing an OpenMP static worksharing
4939// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4940// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4941static FunctionCallee
4942getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4943 WorksharingLoopType LoopType) {
4944 unsigned Bitwidth = Ty->getIntegerBitWidth();
4945 Module &M = OMPBuilder->M;
4946 switch (LoopType) {
4947 case WorksharingLoopType::ForStaticLoop:
4948 if (Bitwidth == 32)
4949 return OMPBuilder->getOrCreateRuntimeFunction(
4950 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4951 if (Bitwidth == 64)
4952 return OMPBuilder->getOrCreateRuntimeFunction(
4953 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4954 break;
4955 case WorksharingLoopType::DistributeStaticLoop:
4956 if (Bitwidth == 32)
4957 return OMPBuilder->getOrCreateRuntimeFunction(
4958 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4959 if (Bitwidth == 64)
4960 return OMPBuilder->getOrCreateRuntimeFunction(
4961 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4962 break;
4963 case WorksharingLoopType::DistributeForStaticLoop:
4964 if (Bitwidth == 32)
4965 return OMPBuilder->getOrCreateRuntimeFunction(
4966 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4967 if (Bitwidth == 64)
4968 return OMPBuilder->getOrCreateRuntimeFunction(
4969 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4970 break;
4971 }
4972 if (Bitwidth != 32 && Bitwidth != 64) {
4973 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4974 }
4975 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4976}
4977
4978// Inserts a call to the proper OpenMP device RTL function which handles
4979// loop worksharing.
4980static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4981 WorksharingLoopType LoopType,
4982 BasicBlock *InsertBlock, Value *Ident,
4983 Value *LoopBodyArg, Value *TripCount,
4984 Function &LoopBodyFn, bool NoLoop) {
4985 Type *TripCountTy = TripCount->getType();
4986 Module &M = OMPBuilder->M;
4987 IRBuilder<> &Builder = OMPBuilder->Builder;
4988 FunctionCallee RTLFn =
4989 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4990 SmallVector<Value *, 8> RealArgs;
4991 RealArgs.push_back(Ident);
4992 RealArgs.push_back(&LoopBodyFn);
4993 RealArgs.push_back(LoopBodyArg);
4994 RealArgs.push_back(TripCount);
4995 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4996 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4997 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4998 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4999 Builder.CreateCall(RTLFn, RealArgs);
5000 return;
5001 }
5002 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5003 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5004 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5005 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5006
5007 RealArgs.push_back(
5008 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5009 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5010 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5011 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5012 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5013 } else {
5014 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5015 }
5016
5017 Builder.CreateCall(RTLFn, RealArgs);
5018}
5019
5020static void workshareLoopTargetCallback(
5021 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5022 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5023 WorksharingLoopType LoopType, bool NoLoop) {
5024 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5025 BasicBlock *Preheader = CLI->getPreheader();
5026 Value *TripCount = CLI->getTripCount();
5027
5028 // After loop body outlining, the loop body contains only the setup of
5029 // the loop body argument structure and the call to the outlined loop
5030 // body function. First, we need to move the setup of the loop body args
5031 // into the loop preheader.
5032 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5033 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5034
5035 // The next step is to remove the whole loop. We do not need it anymore.
5036 // That's why we make an unconditional branch from the loop preheader to
5037 // the loop exit block.
5038 Builder.restoreIP({Preheader, Preheader->end()});
5039 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5040 Preheader->getTerminator()->eraseFromParent();
5041 Builder.CreateBr(CLI->getExit());
5042
5043 // Delete dead loop blocks
5044 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5045 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5046 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5047 CleanUpInfo.EntryBB = CLI->getHeader();
5048 CleanUpInfo.ExitBB = CLI->getExit();
5049 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5050 DeleteDeadBlocks(BlocksToBeRemoved);
5051
5052 // Find the instruction which corresponds to the loop body argument
5053 // structure and remove the call to the loop body function.
5054 Value *LoopBodyArg;
5055 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5056 assert(OutlinedFnUser &&
5057 "Expected unique undroppable user of outlined function");
5058 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5059 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5060 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5061 "Expected outlined function call to be located in loop preheader");
5062 // Check in case no argument structure has been passed.
5063 if (OutlinedFnCallInstruction->arg_size() > 1)
5064 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5065 else
5066 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5067 OutlinedFnCallInstruction->eraseFromParent();
5068
5069 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5070 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5071
5072 for (auto &ToBeDeletedItem : ToBeDeleted)
5073 ToBeDeletedItem->eraseFromParent();
5074 CLI->invalidate();
5075}
5076
5077OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5078 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5079 WorksharingLoopType LoopType, bool NoLoop) {
5080 uint32_t SrcLocStrSize;
5081 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5082 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5083
5084 OutlineInfo OI;
5085 OI.OuterAllocaBB = CLI->getPreheader();
5086 Function *OuterFn = CLI->getPreheader()->getParent();
5087
5088 // Instructions which need to be deleted at the end of code generation
5089 SmallVector<Instruction *, 4> ToBeDeleted;
5090
5091 OI.OuterAllocaBB = AllocaIP.getBlock();
5092
5093 // Mark the loop body as the region which needs to be extracted.
5094 OI.EntryBB = CLI->getBody();
5095 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5096 "omp.prelatch", true);
5097
5098 // Prepare loop body for extraction
5099 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5100
5101 // Insert new loop counter variable which will be used only in loop
5102 // body.
5103 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5104 Instruction *NewLoopCntLoad =
5105 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5106 // New loop counter instructions are redundant in the loop preheader when
5107 // code generation for the workshare loop is finished. That's why we mark
5108 // them as ready for deletion.
5109 ToBeDeleted.push_back(NewLoopCntLoad);
5110 ToBeDeleted.push_back(NewLoopCnt);
5111
5112 // Analyse loop body region. Find all input variables which are used inside
5113 // loop body region.
5114 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5115 SmallVector<BasicBlock *, 32> Blocks;
5116 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5117
5118 CodeExtractorAnalysisCache CEAC(*OuterFn);
5119 CodeExtractor Extractor(Blocks,
5120 /* DominatorTree */ nullptr,
5121 /* AggregateArgs */ true,
5122 /* BlockFrequencyInfo */ nullptr,
5123 /* BranchProbabilityInfo */ nullptr,
5124 /* AssumptionCache */ nullptr,
5125 /* AllowVarArgs */ true,
5126 /* AllowAlloca */ true,
5127 /* AllocationBlock */ CLI->getPreheader(),
5128 /* Suffix */ ".omp_wsloop",
5129 /* AggrArgsIn0AddrSpace */ true);
5130
5131 BasicBlock *CommonExit = nullptr;
5132 SetVector<Value *> SinkingCands, HoistingCands;
5133
5134 // Find allocas outside the loop body region which are used inside loop
5135 // body
5136 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5137
5138 // We need to model the loop body region as the function f(cnt, loop_arg).
5139 // That's why we replace the loop induction variable with the new counter,
5140 // which will be one of the loop body function's arguments.
5141 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5142 CLI->getIndVar()->user_end());
5143 for (auto Use : Users) {
5144 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5145 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5146 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5147 }
5148 }
5149 }
5150 // Make sure that the loop counter variable is not merged into the loop
5151 // body function's argument structure and is passed as a separate variable.
5152 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5153
5154 // The PostOutline callback is invoked when the loop body function has
5155 // been outlined and the loop body replaced by a call to it. We need to
5156 // add a call to the OpenMP device RTL inside the loop preheader; that
5157 // function will handle the loop control logic.
5158 //
5159 OI.PostOutlineCB = [=, ToBeDeletedVec =
5160 std::move(ToBeDeleted)](Function &OutlinedFn) {
5161 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5162 LoopType, NoLoop);
5163 };
5164 addOutlineInfo(std::move(OI));
5165 return CLI->getAfterIP();
5166}
5167
5168OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5169 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5170 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5171 bool HasSimdModifier, bool HasMonotonicModifier,
5172 bool HasNonmonotonicModifier, bool HasOrderedClause,
5173 WorksharingLoopType LoopType, bool NoLoop) {
5174 if (Config.isTargetDevice())
5175 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5176 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5177 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5178 HasNonmonotonicModifier, HasOrderedClause);
5179
5180 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5181 OMPScheduleType::ModifierOrdered;
5182 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5183 case OMPScheduleType::BaseStatic:
5184 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5185 if (IsOrdered)
5186 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5187 NeedsBarrier, ChunkSize);
5188 // FIXME: Monotonicity ignored?
5189 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5190
5191 case OMPScheduleType::BaseStaticChunked:
5192 if (IsOrdered)
5193 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5194 NeedsBarrier, ChunkSize);
5195 // FIXME: Monotonicity ignored?
5196 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5197 ChunkSize);
5198
5199 case OMPScheduleType::BaseRuntime:
5200 case OMPScheduleType::BaseAuto:
5201 case OMPScheduleType::BaseGreedy:
5202 case OMPScheduleType::BaseBalanced:
5203 case OMPScheduleType::BaseSteal:
5204 case OMPScheduleType::BaseGuidedSimd:
5205 case OMPScheduleType::BaseRuntimeSimd:
5206 assert(!ChunkSize &&
5207 "schedule type does not support user-defined chunk sizes");
5208 [[fallthrough]];
5209 case OMPScheduleType::BaseDynamicChunked:
5210 case OMPScheduleType::BaseGuidedChunked:
5211 case OMPScheduleType::BaseGuidedIterativeChunked:
5212 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5213 case OMPScheduleType::BaseStaticBalancedChunked:
5214 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5215 NeedsBarrier, ChunkSize);
5216
5217 default:
5218 llvm_unreachable("Unknown/unimplemented schedule kind");
5219 }
5220}
5221
5222/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5223/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5224/// the runtime. Always interpret integers as unsigned similarly to
5225/// CanonicalLoopInfo.
5226static FunctionCallee
5227getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5228 unsigned Bitwidth = Ty->getIntegerBitWidth();
5229 if (Bitwidth == 32)
5230 return OMPBuilder.getOrCreateRuntimeFunction(
5231 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5232 if (Bitwidth == 64)
5233 return OMPBuilder.getOrCreateRuntimeFunction(
5234 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5235 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5236}
5237
5238/// Returns an LLVM function to call for updating the next loop using OpenMP
5239/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5240/// the runtime. Always interpret integers as unsigned similarly to
5241/// CanonicalLoopInfo.
5242static FunctionCallee
5243getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5244 unsigned Bitwidth = Ty->getIntegerBitWidth();
5245 if (Bitwidth == 32)
5246 return OMPBuilder.getOrCreateRuntimeFunction(
5247 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5248 if (Bitwidth == 64)
5249 return OMPBuilder.getOrCreateRuntimeFunction(
5250 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5251 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5252}
5253
5254/// Returns an LLVM function to call for finalizing the dynamic loop,
5255/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5256/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5257static FunctionCallee
5258getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5259 unsigned Bitwidth = Ty->getIntegerBitWidth();
5260 if (Bitwidth == 32)
5261 return OMPBuilder.getOrCreateRuntimeFunction(
5262 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5263 if (Bitwidth == 64)
5264 return OMPBuilder.getOrCreateRuntimeFunction(
5265 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5266 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5267}
5268
5269OpenMPIRBuilder::InsertPointOrErrorTy
5270OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5271 InsertPointTy AllocaIP,
5272 OMPScheduleType SchedType,
5273 bool NeedsBarrier, Value *Chunk) {
5274 assert(CLI->isValid() && "Requires a valid canonical loop");
5275 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5276 "Require dedicated allocate IP");
5278 "Require valid schedule type");
5279
5280 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5281 OMPScheduleType::ModifierOrdered;
5282
5283 // Set up the source location value for OpenMP runtime.
5284 Builder.SetCurrentDebugLocation(DL);
5285
5286 uint32_t SrcLocStrSize;
5287 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5288 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5289
5290 // Declare useful OpenMP runtime functions.
5291 Value *IV = CLI->getIndVar();
5292 Type *IVTy = IV->getType();
5293 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5294 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5295
5296 // Allocate space for computed loop bounds as expected by the "init" function.
5297 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5298 Type *I32Type = Type::getInt32Ty(M.getContext());
5299 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5300 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5301 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5302 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5303 CLI->setLastIter(PLastIter);
5304
5305 // At the end of the preheader, prepare for calling the "init" function by
5306 // storing the current loop bounds into the allocated space. A canonical loop
5307 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5308 // and produces an inclusive upper bound.
5309 BasicBlock *PreHeader = CLI->getPreheader();
5310 Builder.SetInsertPoint(PreHeader->getTerminator());
5311 Constant *One = ConstantInt::get(IVTy, 1);
5312 Builder.CreateStore(One, PLowerBound);
5313 Value *UpperBound = CLI->getTripCount();
5314 Builder.CreateStore(UpperBound, PUpperBound);
5315 Builder.CreateStore(One, PStride);
5316
5317 BasicBlock *Header = CLI->getHeader();
5318 BasicBlock *Exit = CLI->getExit();
5319 BasicBlock *Cond = CLI->getCond();
5320 BasicBlock *Latch = CLI->getLatch();
5321 InsertPointTy AfterIP = CLI->getAfterIP();
5322
5323 // The CLI will be "broken" in the code below, as the loop is no longer
5324 // a valid canonical loop.
5325
5326 if (!Chunk)
5327 Chunk = One;
5328
5329 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5330
5331 Constant *SchedulingType =
5332 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5333
5334 // Call the "init" function.
5335 Builder.CreateCall(DynamicInit,
5336 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5337 UpperBound, /* step */ One, Chunk});
5338
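// After the rewiring below, each iteration of the new outer loop asks the
// runtime for the next chunk via the "next" function:
//
//   preheader -> outer.cond --(no more chunks)--> exit
//   outer.cond --(got a chunk)--> header -> cond -> body -> latch -> header
//   cond --(chunk exhausted)--> outer.cond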
5339 // An outer loop around the existing one.
5340 BasicBlock *OuterCond = BasicBlock::Create(
5341 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5342 PreHeader->getParent());
5343 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5344 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5345 Value *Res =
5346 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5347 PLowerBound, PUpperBound, PStride});
5348 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5349 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5350 Value *LowerBound =
5351 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5352 Builder.CreateCondBr(MoreWork, Header, Exit);
5353
5354 // Change PHI-node in loop header to use outer cond rather than preheader,
5355 // and set IV to the LowerBound.
5356 Instruction *Phi = &Header->front();
5357 auto *PI = cast<PHINode>(Phi);
5358 PI->setIncomingBlock(0, OuterCond);
5359 PI->setIncomingValue(0, LowerBound);
5360
5361 // Then set the pre-header to jump to the OuterCond
5362 Instruction *Term = PreHeader->getTerminator();
5363 auto *Br = cast<BranchInst>(Term);
5364 Br->setSuccessor(0, OuterCond);
5365
5366 // Modify the inner condition:
5367 // * Use the UpperBound returned from the DynamicNext call.
5368 // * Jump to the outer loop when the current chunk's inner loop is done.
5369 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5370 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5371 Instruction *Comp = &*Builder.GetInsertPoint();
5372 auto *CI = cast<CmpInst>(Comp);
5373 CI->setOperand(1, UpperBound);
5374 // Redirect the inner exit to branch to outer condition.
5375 Instruction *Branch = &Cond->back();
5376 auto *BI = cast<BranchInst>(Branch);
5377 assert(BI->getSuccessor(1) == Exit);
5378 BI->setSuccessor(1, OuterCond);
5379
5380 // Call the "fini" function if "ordered" is present in wsloop directive.
5381 if (Ordered) {
5382 Builder.SetInsertPoint(&Latch->back());
5383 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5384 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5385 }
5386
5387 // Add the barrier if requested.
5388 if (NeedsBarrier) {
5389 Builder.SetInsertPoint(&Exit->back());
5390 InsertPointOrErrorTy BarrierIP =
5391 createBarrier(LocationDescription(Builder.saveIP(), DL),
5392 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5393 /* CheckCancelFlag */ false);
5394 if (!BarrierIP)
5395 return BarrierIP.takeError();
5396 }
5397
5398 CLI->invalidate();
5399 return AfterIP;
5400}
5401
5402/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5403/// after this \p OldTarget will be orphaned.
5404static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5405 BasicBlock *NewTarget, DebugLoc DL) {
5406 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5407 redirectTo(Pred, NewTarget, DL);
5408}
5409
5410/// Determine which blocks in \p BBs are reachable from outside and remove the
5411/// ones that are not reachable from the function.
5412static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5413 SmallPtrSet<BasicBlock *, 6> BBsToErase(BBs.begin(), BBs.end());
5414 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5415 for (Use &U : BB->uses()) {
5416 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5417 if (!UseInst)
5418 continue;
5419 if (BBsToErase.count(UseInst->getParent()))
5420 continue;
5421 return true;
5422 }
5423 return false;
5424 };
5425
5426 while (BBsToErase.remove_if(HasRemainingUses)) {
5427 // Try again if anything was removed.
5428 }
5429
5430 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5431 DeleteDeadBlocks(BBVec);
5432}
5433
5434CanonicalLoopInfo *
5435OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5436 InsertPointTy ComputeIP) {
5437 assert(Loops.size() >= 1 && "At least one loop required");
5438 size_t NumLoops = Loops.size();
5439
5440 // Nothing to do if there is already just one loop.
5441 if (NumLoops == 1)
5442 return Loops.front();
5443
5444 CanonicalLoopInfo *Outermost = Loops.front();
5445 CanonicalLoopInfo *Innermost = Loops.back();
5446 BasicBlock *OrigPreheader = Outermost->getPreheader();
5447 BasicBlock *OrigAfter = Outermost->getAfter();
5448 Function *F = OrigPreheader->getParent();
5449
5450 // Loop control blocks that may become orphaned later.
5451 SmallVector<BasicBlock *, 12> OldControlBBs;
5452 OldControlBBs.reserve(6 * Loops.size());
5453 for (CanonicalLoopInfo *Loop : Loops)
5454 Loop->collectControlBlocks(OldControlBBs);
5455
5456 // Set up the IRBuilder for inserting the trip count computation.
5457 Builder.SetCurrentDebugLocation(DL);
5458 if (ComputeIP.isSet())
5459 Builder.restoreIP(ComputeIP);
5460 else
5461 Builder.restoreIP(Outermost->getPreheaderIP());
5462
5463 // Derive the collapsed loop's trip count.
5464 // TODO: Find common/largest indvar type.
5465 Value *CollapsedTripCount = nullptr;
5466 for (CanonicalLoopInfo *L : Loops) {
5467 assert(L->isValid() &&
5468 "All loops to collapse must be valid canonical loops");
5469 Value *OrigTripCount = L->getTripCount();
5470 if (!CollapsedTripCount) {
5471 CollapsedTripCount = OrigTripCount;
5472 continue;
5473 }
5474
5475 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5476 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5477 }
5478
5479 // Create the collapsed loop control flow.
5480 CanonicalLoopInfo *Result =
5481 createLoopSkeleton(DL, CollapsedTripCount, F,
5482 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5483
5484 // Build the collapsed loop body code.
5485 // Start with deriving the input loop induction variables from the collapsed
5486 // one, using a divmod scheme. To preserve the original loops' order, the
5487 // innermost loop uses the least significant bits.
5488 Builder.restoreIP(Result->getBodyIP());
5489
5490 Value *Leftover = Result->getIndVar();
5491 SmallVector<Value *> NewIndVars;
5492 NewIndVars.resize(NumLoops);
5493 for (int i = NumLoops - 1; i >= 1; --i) {
5494 Value *OrigTripCount = Loops[i]->getTripCount();
5495
5496 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5497 NewIndVars[i] = NewIndVar;
5498
5499 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5500 }
5501 // Outermost loop gets all the remaining bits.
5502 NewIndVars[0] = Leftover;
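// For two collapsed loops with trip counts M and N, this computes, for the
// collapsed induction variable iv in [0, M*N):
//   NewIndVars[1] = iv % N  (innermost)
//   NewIndVars[0] = iv / N  (outermost)
// which enumerates the original iteration space in the original order.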
5503
5504 // Construct the loop body control flow.
5505 // We progressively construct the branch structure following the direction of
5506 // the control flow, from the leading in-between code, the loop nest body, the
5507 // trailing in-between code, and rejoining the collapsed loop's latch.
5508 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
5509 // the ContinueBlock is set, continue with that block. If ContinuePred, use
5510 // its predecessors as sources.
5511 BasicBlock *ContinueBlock = Result->getBody();
5512 BasicBlock *ContinuePred = nullptr;
5513 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5514 BasicBlock *NextSrc) {
5515 if (ContinueBlock)
5516 redirectTo(ContinueBlock, Dest, DL);
5517 else
5518 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5519
5520 ContinueBlock = nullptr;
5521 ContinuePred = NextSrc;
5522 };
5523
5524 // The code before the nested loop of each level.
5525 // Because we are sinking it into the nest, it will be executed more often
5526 // than the original loop. More sophisticated schemes could keep track of what
5527 // the in-between code is and instantiate it only once per thread.
5528 for (size_t i = 0; i < NumLoops - 1; ++i)
5529 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5530
5531 // Connect the loop nest body.
5532 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5533
5534 // The code after the nested loop at each level.
5535 for (size_t i = NumLoops - 1; i > 0; --i)
5536 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5537
5538 // Connect the finished loop to the collapsed loop latch.
5539 ContinueWith(Result->getLatch(), nullptr);
5540
5541 // Replace the input loops with the new collapsed loop.
5542 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5543 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5544
5545 // Replace the input loop indvars with the derived ones.
5546 for (size_t i = 0; i < NumLoops; ++i)
5547 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5548
5549 // Remove unused parts of the input loops.
5550 removeUnusedBlocksFromParent(OldControlBBs);
5551
5552 for (CanonicalLoopInfo *L : Loops)
5553 L->invalidate();
5554
5555#ifndef NDEBUG
5556 Result->assertOK();
5557#endif
5558 return Result;
5559}
5560
5561std::vector<CanonicalLoopInfo *>
5562OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5563 ArrayRef<Value *> TileSizes) {
5564 assert(TileSizes.size() == Loops.size() &&
5565 "Must pass as many tile sizes as there are loops");
5566 int NumLoops = Loops.size();
5567 assert(NumLoops >= 1 && "At least one loop to tile required");
5568
5569 CanonicalLoopInfo *OutermostLoop = Loops.front();
5570 CanonicalLoopInfo *InnermostLoop = Loops.back();
5571 Function *F = OutermostLoop->getBody()->getParent();
5572 BasicBlock *InnerEnter = InnermostLoop->getBody();
5573 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5574
5575 // Loop control blocks that may become orphaned later.
5576 SmallVector<BasicBlock *, 12> OldControlBBs;
5577 OldControlBBs.reserve(6 * Loops.size());
5578 for (CanonicalLoopInfo *Loop : Loops)
5579 Loop->collectControlBlocks(OldControlBBs);
5580
5581 // Collect original trip counts and induction variables to be accessible by
5582 // index. Also, the structure of the original loops is not preserved during
5583 // the construction of the tiled loops, so do it before we scavenge the BBs of
5584 // any original CanonicalLoopInfo.
5585 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5586 for (CanonicalLoopInfo *L : Loops) {
5587 assert(L->isValid() && "All input loops must be valid canonical loops");
5588 OrigTripCounts.push_back(L->getTripCount());
5589 OrigIndVars.push_back(L->getIndVar());
5590 }
5591
5592 // Collect the code between loop headers. These may contain SSA definitions
5593 // that are used in the loop nest body. To be usable within the innermost
5594 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5595 // these instructions may be executed more often than before the tiling.
5596 // TODO: It would be sufficient to only sink them into the body of the
5597 // corresponding tile loop.
5598 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5599 for (int i = 0; i < NumLoops - 1; ++i) {
5600 CanonicalLoopInfo *Surrounding = Loops[i];
5601 CanonicalLoopInfo *Nested = Loops[i + 1];
5602
5603 BasicBlock *EnterBB = Surrounding->getBody();
5604 BasicBlock *ExitBB = Nested->getHeader();
5605 InbetweenCode.emplace_back(EnterBB, ExitBB);
5606 }
5607
5608 // Compute the trip counts of the floor loops.
5609 Builder.SetCurrentDebugLocation(DL);
5610 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5611 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5612 for (int i = 0; i < NumLoops; ++i) {
5613 Value *TileSize = TileSizes[i];
5614 Value *OrigTripCount = OrigTripCounts[i];
5615 Type *IVType = OrigTripCount->getType();
5616
5617 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5618 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5619
5620 // 0 if the tilesize divides the tripcount, 1 otherwise.
5621 // 1 means we need an additional iteration for a partial tile.
5622 //
5623 // Unfortunately we cannot just use the roundup formula
5624 // (tripcount + tilesize - 1) / tilesize
5625 // because the summation might overflow. We do not want to introduce undefined
5626 // behavior when the untiled loop nest did not have any.
5627 Value *FloorTripOverflow =
5628 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5629
5630 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5631 Value *FloorTripCount =
5632 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5633 "omp_floor" + Twine(i) + ".tripcount", true);
5634
5635 // Remember some values for later use.
5636 FloorCompleteCount.push_back(FloorCompleteTripCount);
5637 FloorCount.push_back(FloorTripCount);
5638 FloorRems.push_back(FloorTripRem);
5639 }
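// For example, with OrigTripCount = 10 and TileSize = 4 this yields
// FloorCompleteTripCount = 10 / 4 = 2 complete tiles and FloorTripRem =
// 10 % 4 = 2 leftover iterations, hence FloorTripOverflow = 1 and
// FloorTripCount = 2 + 1 = 3 floor iterations, without ever forming the
// overflow-prone sum 10 + 4 - 1.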
5640
5641 // Generate the new loop nest, from the outermost to the innermost.
5642 std::vector<CanonicalLoopInfo *> Result;
5643 Result.reserve(NumLoops * 2);
5644
5645 // The basic block of the surrounding loop that enters the generated loop
5646 // nest.
5647 BasicBlock *Enter = OutermostLoop->getPreheader();
5648
5649 // The basic block of the surrounding loop where the inner code should
5650 // continue.
5651 BasicBlock *Continue = OutermostLoop->getAfter();
5652
5653 // Where the next loop basic block should be inserted.
5654 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5655
5656 auto EmbeddNewLoop =
5657 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5658 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5659 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5660 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5661 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5662 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5663
5664 // Setup the position where the next embedded loop connects to this loop.
5665 Enter = EmbeddedLoop->getBody();
5666 Continue = EmbeddedLoop->getLatch();
5667 OutroInsertBefore = EmbeddedLoop->getLatch();
5668 return EmbeddedLoop;
5669 };
5670
5671 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5672 const Twine &NameBase) {
5673 for (auto P : enumerate(TripCounts)) {
5674 CanonicalLoopInfo *EmbeddedLoop =
5675 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5676 Result.push_back(EmbeddedLoop);
5677 }
5678 };
5679
5680 EmbeddNewLoops(FloorCount, "floor");
5681
5682 // Within the innermost floor loop, emit the code that computes the tile
5683 // sizes.
5684 Builder.SetInsertPoint(Enter->getTerminator());
5685 SmallVector<Value *, 4> TileCounts;
5686 for (int i = 0; i < NumLoops; ++i) {
5687 CanonicalLoopInfo *FloorLoop = Result[i];
5688 Value *TileSize = TileSizes[i];
5689
5690 Value *FloorIsEpilogue =
5691 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5692 Value *TileTripCount =
5693 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5694
5695 TileCounts.push_back(TileTripCount);
5696 }
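// Continuing the example above (OrigTripCount = 10, TileSize = 4): floor
// iterations 0 and 1 compare unequal to FloorCompleteCount = 2 and select
// the full TileSize of 4, while the last floor iteration (indvar == 2)
// selects FloorRems = 2, the partial epilogue tile.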
5697
5698 // Create the tile loops.
5699 EmbeddNewLoops(TileCounts, "tile");
5700
5701 // Insert the inbetween code into the body.
5702 BasicBlock *BodyEnter = Enter;
5703 BasicBlock *BodyEntered = nullptr;
5704 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5705 BasicBlock *EnterBB = P.first;
5706 BasicBlock *ExitBB = P.second;
5707
5708 if (BodyEnter)
5709 redirectTo(BodyEnter, EnterBB, DL);
5710 else
5711 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5712
5713 BodyEnter = nullptr;
5714 BodyEntered = ExitBB;
5715 }
5716
5717 // Append the original loop nest body into the generated loop nest body.
5718 if (BodyEnter)
5719 redirectTo(BodyEnter, InnerEnter, DL);
5720 else
5721 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5723
5724 // Replace the original induction variable with an induction variable computed
5725 // from the tile and floor induction variables.
5726 Builder.restoreIP(Result.back()->getBodyIP());
5727 for (int i = 0; i < NumLoops; ++i) {
5728 CanonicalLoopInfo *FloorLoop = Result[i];
5729 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5730 Value *OrigIndVar = OrigIndVars[i];
5731 Value *Size = TileSizes[i];
5732
5733 Value *Scale =
5734 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5735 Value *Shift =
5736 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5737 OrigIndVar->replaceAllUsesWith(Shift);
5738 }
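// The reconstructed value is OrigIndVar = FloorIndVar * TileSize +
// TileIndVar; e.g. with TileSize = 4, a floor indvar of 2 and a tile indvar
// of 1 this yields original iteration 2 * 4 + 1 = 9. NUW holds because the
// result always stays below the original trip count.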
5739
5740 // Remove unused parts of the original loops.
5741 removeUnusedBlocksFromParent(OldControlBBs);
5742
5743 for (CanonicalLoopInfo *L : Loops)
5744 L->invalidate();
5745
5746#ifndef NDEBUG
5747 for (CanonicalLoopInfo *GenL : Result)
5748 GenL->assertOK();
5749#endif
5750 return Result;
5751}
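// A minimal call-site sketch (hypothetical caller-side names; assumes two
// valid CanonicalLoopInfos Outer and Inner forming a perfect loop nest):
//   Value *Four = ConstantInt::get(Outer->getIndVarType(), 4);
//   std::vector<CanonicalLoopInfo *> Nest =
//       OMPBuilder.tileLoops(DL, {Outer, Inner}, {Four, Four});
// Nest then holds {floor0, floor1, tile0, tile1}, outermost first; Outer
// and Inner are invalidated and must not be used afterwards.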
5752
5753/// Attach metadata \p Properties to the basic block described by \p BB. If the
5754/// basic block already has metadata, the basic block properties are appended.
5755static void addBasicBlockMetadata(BasicBlock *BB,
5756 ArrayRef<Metadata *> Properties) {
5757 // Nothing to do if no property to attach.
5758 if (Properties.empty())
5759 return;
5760
5761 LLVMContext &Ctx = BB->getContext();
5762 SmallVector<Metadata *> NewProperties;
5763 NewProperties.push_back(nullptr);
5764
5765 // If the basic block already has metadata, prepend it to the new metadata.
5766 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5767 if (Existing)
5768 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5769
5770 append_range(NewProperties, Properties);
5771 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5772 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5773
5774 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5775}
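// The null placeholder pushed first becomes the node's self-reference, so
// the terminator ends up with the canonical llvm.loop shape, schematically:
//   !0 = distinct !{!0, <existing properties...>, <new properties...>}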
5776
5777/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5778/// loop already has metadata, the loop properties are appended.
5779static void addLoopMetadata(CanonicalLoopInfo *Loop,
5780 ArrayRef<Metadata *> Properties) {
5781 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5782
5783 // Attach metadata to the loop's latch
5784 BasicBlock *Latch = Loop->getLatch();
5785 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5786 addBasicBlockMetadata(Latch, Properties);
5787}
5788
5789/// Attach llvm.access.group metadata to the memref instructions of \p Block
5790static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5791 LoopInfo &LI) {
5792 for (Instruction &I : *Block) {
5793 if (I.mayReadOrWriteMemory()) {
5794 // TODO: This instruction may already have access group from
5795 // other pragmas e.g. #pragma clang loop vectorize. Append
5796 // so that the existing metadata is not overwritten.
5797 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5798 }
5799 }
5800}
5801
5802void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5803 LLVMContext &Ctx = Builder.getContext();
5804 addLoopMetadata(
5805 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5806 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5807}
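// The latch terminator then carries, schematically:
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.enable"}
//   !2 = !{!"llvm.loop.unroll.full"}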
5808
5809void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5810 LLVMContext &Ctx = Builder.getContext();
5811 addLoopMetadata(
5812 Loop, {
5813 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5814 });
5815}
5816
5817void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5818 Value *IfCond, ValueToValueMapTy &VMap,
5819 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5820 const Twine &NamePrefix) {
5821 Function *F = CanonicalLoop->getFunction();
5822
5823 // We can't do
5824 // if (cond) {
5825 // simd_loop;
5826 // } else {
5827 // non_simd_loop;
5828 // }
5829 // because then the CanonicalLoopInfo would only point to one of the loops,
5830 // causing other constructs operating on the same loop to malfunction.
5831 // Instead generate
5832 // while (...) {
5833 // if (cond) {
5834 // simd_body;
5835 // } else {
5836 // not_simd_body;
5837 // }
5838 // }
5839 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5840 // body at -O3
5841
5842 // Define where the if branch should be inserted.
5843 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5844
5845 // Create additional blocks for the if statement
5846 BasicBlock *Cond = SplitBeforeIt->getParent();
5847 llvm::LLVMContext &C = Cond->getContext();
5848 BasicBlock *ThenBlock = BasicBlock::Create(
5849 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5850 BasicBlock *ElseBlock = BasicBlock::Create(
5851 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5852
5853 // Create if condition branch.
5854 Builder.SetInsertPoint(SplitBeforeIt);
5855 Instruction *BrInstr =
5856 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5857 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5858 // Then block contains branch to omp loop body which needs to be vectorized
5859 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5860 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5861
5862 Builder.SetInsertPoint(ElseBlock);
5863
5864 // Clone loop for the else branch
5865 SmallVector<BasicBlock *, 8> NewBlocks;
5866
5867 SmallVector<BasicBlock *, 8> ExistingBlocks;
5868 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5869 ExistingBlocks.push_back(ThenBlock);
5870 ExistingBlocks.append(L->block_begin(), L->block_end());
5871 // Cond is the block that has the if clause condition
5872 // LoopCond is omp_loop.cond
5873 // LoopHeader is omp_loop.header
5874 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5875 BasicBlock *LoopHeader = LoopCond ? LoopCond->getUniquePredecessor() : nullptr;
5876 assert(LoopCond && LoopHeader && "Invalid loop structure");
5877 for (BasicBlock *Block : ExistingBlocks) {
5878 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5879 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5880 continue;
5881 }
5882 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5883
5884 // fix name not to be omp.if.then
5885 if (Block == ThenBlock)
5886 NewBB->setName(NamePrefix + ".if.else");
5887
5888 NewBB->moveBefore(CanonicalLoop->getExit());
5889 VMap[Block] = NewBB;
5890 NewBlocks.push_back(NewBB);
5891 }
5892 remapInstructionsInBlocks(NewBlocks, VMap);
5893 Builder.CreateBr(NewBlocks.front());
5894
5895 // The loop latch must have only one predecessor. Currently it is branched to
5896 // from both the 'then' and 'else' branches.
5897 L->getLoopLatch()->splitBasicBlock(
5898 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5899
5900 // Ensure that the then block is added to the loop so we add the attributes in
5901 // the next step
5902 L->addBasicBlockToLoop(ThenBlock, LI);
5903}
5904
5905unsigned
5906OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5907 const StringMap<bool> &Features) {
5908 if (TargetTriple.isX86()) {
5909 if (Features.lookup("avx512f"))
5910 return 512;
5911 else if (Features.lookup("avx"))
5912 return 256;
5913 return 128;
5914 }
5915 if (TargetTriple.isPPC())
5916 return 128;
5917 if (TargetTriple.isWasm())
5918 return 128;
5919 return 0;
5920}
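// A usage sketch (hypothetical OMPBuilder instance): querying an x86-64
// target with AVX-512 enabled returns 512; with only "avx" it would be 256,
// otherwise 128.
//   StringMap<bool> Features;
//   Features["avx512f"] = true;
//   unsigned Alignment = OMPBuilder.getOpenMPDefaultSimdAlign(
//       Triple("x86_64-unknown-linux-gnu"), Features);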
5921
5922void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5923 MapVector<Value *, Value *> AlignedVars,
5924 Value *IfCond, OrderKind Order,
5925 ConstantInt *Simdlen, ConstantInt *Safelen) {
5926 LLVMContext &Ctx = Builder.getContext();
5927
5928 Function *F = CanonicalLoop->getFunction();
5929
5930 // TODO: We should not rely on pass manager. Currently we use pass manager
5931 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5932 // object. We should have a method which returns all blocks between
5933 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5934 FunctionAnalysisManager FAM;
5935 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5936 FAM.registerPass([]() { return LoopAnalysis(); });
5937 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5938
5939 LoopAnalysis LIA;
5940 LoopInfo &&LI = LIA.run(*F, FAM);
5941
5942 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5943 if (AlignedVars.size()) {
5944 InsertPointTy IP = Builder.saveIP();
5945 for (auto &AlignedItem : AlignedVars) {
5946 Value *AlignedPtr = AlignedItem.first;
5947 Value *Alignment = AlignedItem.second;
5948 Instruction *loadInst = cast<Instruction>(AlignedPtr);
5949 Builder.SetInsertPoint(loadInst->getNextNode());
5950 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5951 Alignment);
5952 }
5953 Builder.restoreIP(IP);
5954 }
5955
5956 if (IfCond) {
5957 ValueToValueMapTy VMap;
5958 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5959 }
5960
5961 SmallPtrSet<BasicBlock *, 8> Reachable;
5960
5962
5963 // Get the basic blocks from the loop in which memref instructions
5964 // can be found.
5965 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5966 // preferably without running any passes.
5967 for (BasicBlock *Block : L->getBlocks()) {
5968 if (Block == CanonicalLoop->getCond() ||
5969 Block == CanonicalLoop->getHeader())
5970 continue;
5971 Reachable.insert(Block);
5972 }
5973
5974 SmallVector<Metadata *> LoopMDList;
5975
5976 // In presence of finite 'safelen', it may be unsafe to mark all
5977 // the memory instructions parallel, because loop-carried
5978 // dependences of 'safelen' iterations are possible.
5979 // If clause order(concurrent) is specified then the memory instructions
5980 // are marked parallel even if 'safelen' is finite.
5981 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5982 // Add access group metadata to memory-access instructions.
5983 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5984 for (BasicBlock *BB : Reachable)
5985 addSimdMetadata(BB, AccessGroup, LI);
5986 // TODO: If the loop has existing parallel access metadata, have
5987 // to combine two lists.
5988 LoopMDList.push_back(MDNode::get(
5989 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5990 }
5991
5992 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5993 // versions so we can't add the loop attributes in that case.
5994 if (IfCond) {
5995 // we can still add llvm.loop.parallel_access
5996 addLoopMetadata(CanonicalLoop, LoopMDList);
5997 return;
5998 }
5999
6000 // Use the above access group metadata to create loop level
6001 // metadata, which should be distinct for each loop.
6002 ConstantAsMetadata *BoolConst =
6003 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6004 LoopMDList.push_back(MDNode::get(
6005 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6006
6007 if (Simdlen || Safelen) {
6008 // If both simdlen and safelen clauses are specified, the value of the
6009 // simdlen parameter must be less than or equal to the value of the safelen
6010 // parameter. Therefore, use safelen only in the absence of simdlen.
6011 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6012 LoopMDList.push_back(
6013 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6014 ConstantAsMetadata::get(VectorizeWidth)}));
6015 }
6016
6017 addLoopMetadata(CanonicalLoop, LoopMDList);
6018}
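// For a loop with simdlen(8), no safelen and no if clause, the latch ends
// up with, schematically:
//   !llvm.loop !0
//   !0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1},
//                   !{!"llvm.loop.vectorize.enable", i1 true},
//                   !{!"llvm.loop.vectorize.width", i32 8}}
// where !1 is the distinct access group attached to the body's memory
// accesses above.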
6019
6020/// Create the TargetMachine object to query the backend for optimization
6021/// preferences.
6022///
6023/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6024/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6025/// needed for the LLVM pass pipeline. We use some default options to avoid
6026/// having to pass too many settings from the frontend that probably do not
6027/// matter.
6028///
6029/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6030/// method. If we are going to use TargetMachine for more purposes, especially
6031/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6032/// might become worth requiring front-ends to pass on their TargetMachine,
6033/// or at least cache it between methods. Note that while frontends such as Clang
6034/// have just a single main TargetMachine per translation unit, "target-cpu" and
6035/// "target-features" that determine the TargetMachine are per-function and can
6036/// be overridden using __attribute__((target("OPTIONS"))).
6037static std::unique_ptr<TargetMachine>
6038createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6039 Module *M = F->getParent();
6040
6041 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6042 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6043 const llvm::Triple &Triple = M->getTargetTriple();
6044
6045 std::string Error;
6046 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6047 if (!TheTarget)
6048 return {};
6049
6050 llvm::TargetOptions Options;
6051 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6052 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6053 /*CodeModel=*/std::nullopt, OptLevel));
6054}
6055
6056/// Heuristically determine the best-performant unroll factor for \p CLI. This
6057/// depends on the target processor. We are re-using the same heuristics as the
6058/// LoopUnrollPass.
6059static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6060 Function *F = CLI->getFunction();
6061
6062 // Assume the user requests the most aggressive unrolling, even if the rest of
6063 // the code is optimized using a lower setting.
6064 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6065 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6066
6067 FunctionAnalysisManager FAM;
6068 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6069 FAM.registerPass([]() { return AssumptionAnalysis(); });
6070 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6071 FAM.registerPass([]() { return LoopAnalysis(); });
6072 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6073 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6074 TargetIRAnalysis TIRA;
6075 if (TM)
6076 TIRA = TargetIRAnalysis(
6077 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6078 FAM.registerPass([&]() { return TIRA; });
6079
6080 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6081 ScalarEvolutionAnalysis SEA;
6082 ScalarEvolution &&SE = SEA.run(*F, FAM);
6083 DominatorTreeAnalysis DTA;
6084 DominatorTree &&DT = DTA.run(*F, FAM);
6085 LoopAnalysis LIA;
6086 LoopInfo &&LI = LIA.run(*F, FAM);
6087 AssumptionAnalysis ACT;
6088 AssumptionCache &&AC = ACT.run(*F, FAM);
6089 OptimizationRemarkEmitter ORE{F};
6090
6091 Loop *L = LI.getLoopFor(CLI->getHeader());
6092 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6093
6094 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6095 L, SE, TTI,
6096 /*BlockFrequencyInfo=*/nullptr,
6097 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6098 /*UserThreshold=*/std::nullopt,
6099 /*UserCount=*/std::nullopt,
6100 /*UserAllowPartial=*/true,
6101 /*UserAllowRuntime=*/true,
6102 /*UserUpperBound=*/std::nullopt,
6103 /*UserFullUnrollMaxCount=*/std::nullopt);
6104
6105 UP.Force = true;
6106
6107 // Account for additional optimizations taking place before the LoopUnrollPass
6108 // would unroll the loop.
6109 UP.Threshold *= UnrollThresholdFactor;
6110 UP.PartialThreshold *= UnrollThresholdFactor;
6111
6112 // Use normal unroll factors even if the rest of the code is optimized for
6113 // size.
6114 UP.OptSizeThreshold = UP.Threshold;
6115 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6116
6117 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6118 << " Threshold=" << UP.Threshold << "\n"
6119 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6120 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6121 << " PartialOptSizeThreshold="
6122 << UP.PartialOptSizeThreshold << "\n");
6123
6124 // Disable peeling.
6125 TargetTransformInfo::PeelingPreferences PP =
6126 gatherPeelingPreferences(L, SE, TTI,
6127 /*UserAllowPeeling=*/false,
6128 /*UserAllowProfileBasedPeeling=*/false,
6129 /*UnrollingSpecficValues=*/false);
6130
6131 SmallPtrSet<const Value *, 32> EphValues;
6132 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6133
6134 // Assume that reads and writes to stack variables can be eliminated by
6135 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6136 // size.
6137 for (BasicBlock *BB : L->blocks()) {
6138 for (Instruction &I : *BB) {
6139 Value *Ptr;
6140 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6141 Ptr = Load->getPointerOperand();
6142 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6143 Ptr = Store->getPointerOperand();
6144 } else
6145 continue;
6146
6147 Ptr = Ptr->stripPointerCasts();
6148
6149 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6150 if (Alloca->getParent() == &F->getEntryBlock())
6151 EphValues.insert(&I);
6152 }
6153 }
6154 }
6155
6156 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6157
6158 // Loop is not unrollable if the loop contains certain instructions.
6159 if (!UCE.canUnroll()) {
6160 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6161 return 1;
6162 }
6163
6164 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6165 << "\n");
6166
6167 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6168 // be able to use it.
6169 int TripCount = 0;
6170 int MaxTripCount = 0;
6171 bool MaxOrZero = false;
6172 unsigned TripMultiple = 0;
6173
6174 bool UseUpperBound = false;
6175 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6176 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6177 UseUpperBound);
6178 unsigned Factor = UP.Count;
6179 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6180
6181 // This function returns 1 to signal that the loop should not be unrolled.
6182 if (Factor == 0)
6183 return 1;
6184 return Factor;
6185}
6186
6187void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6188 int32_t Factor,
6189 CanonicalLoopInfo **UnrolledCLI) {
6190 assert(Factor >= 0 && "Unroll factor must not be negative");
6191
6192 Function *F = Loop->getFunction();
6193 LLVMContext &Ctx = F->getContext();
6194
6195 // If the unrolled loop is not used for another loop-associated directive, it
6196 // is sufficient to add metadata for the LoopUnrollPass.
6197 if (!UnrolledCLI) {
6198 SmallVector<Metadata *, 2> LoopMetadata;
6199 LoopMetadata.push_back(
6200 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6201
6202 if (Factor >= 1) {
6203 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6204 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6205 LoopMetadata.push_back(MDNode::get(
6206 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6207 }
6208
6209 addLoopMetadata(Loop, LoopMetadata);
6210 return;
6211 }
6212
6213 // Heuristically determine the unroll factor.
6214 if (Factor == 0)
6215 Factor = computeHeuristicUnrollFactor(Loop);
6216
6217 // No change required with unroll factor 1.
6218 if (Factor == 1) {
6219 *UnrolledCLI = Loop;
6220 return;
6221 }
6222
6223 assert(Factor >= 2 &&
6224 "unrolling only makes sense with a factor of 2 or larger");
6225
6226 Type *IndVarTy = Loop->getIndVarType();
6227
6228 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6229 // unroll the inner loop.
6230 Value *FactorVal =
6231 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6232 /*isSigned=*/false));
6233 std::vector<CanonicalLoopInfo *> LoopNest =
6234 tileLoops(DL, {Loop}, {FactorVal});
6235 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6236 *UnrolledCLI = LoopNest[0];
6237 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6238
6239 // LoopUnrollPass can only fully unroll loops with constant trip count.
6240 // Unroll by the unroll factor with a fallback epilog for the remainder
6241 // iterations if necessary.
6242 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6243 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6244 addLoopMetadata(
6245 InnerLoop,
6246 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6247 MDNode::get(
6248 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6249
6250#ifndef NDEBUG
6251 (*UnrolledCLI)->assertOK();
6252#endif
6253}
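// A minimal call-site sketch (hypothetical names): request a factor-4
// partial unroll and receive the loop to associate further directives with:
//   CanonicalLoopInfo *Unrolled = nullptr;
//   OMPBuilder.unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled);
// Unrolled is the outer floor loop; the inner 4-iteration tile loop is
// marked for unrolling by the LoopUnrollPass as described above.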
6254
6255OpenMPIRBuilder::InsertPointTy
6256OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6257 llvm::Value *BufSize, llvm::Value *CpyBuf,
6258 llvm::Value *CpyFn, llvm::Value *DidIt) {
6259 if (!updateToLocation(Loc))
6260 return Loc.IP;
6261
6262 uint32_t SrcLocStrSize;
6263 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6264 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6265 Value *ThreadId = getOrCreateThreadID(Ident);
6266
6267 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6268
6269 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6270
6271 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6272 Builder.CreateCall(Fn, Args);
6273
6274 return Builder.saveIP();
6275}
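// On a 64-bit target the emitted sequence has the shape:
//   %did_it = load i32, ptr %DidIt
//   call void @__kmpc_copyprivate(ptr %ident, i32 %tid, i64 %BufSize,
//                                 ptr %CpyBuf, ptr %CpyFn, i32 %did_it)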
6276
6277OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6278 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6279 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6281
6282 if (!updateToLocation(Loc))
6283 return Loc.IP;
6284
6285 // If needed allocate and initialize `DidIt` with 0.
6286 // DidIt: flag variable: 1=single thread; 0=not single thread.
6287 llvm::Value *DidIt = nullptr;
6288 if (!CPVars.empty()) {
6289 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6290 Builder.CreateStore(Builder.getInt32(0), DidIt);
6291 }
6292
6293 Directive OMPD = Directive::OMPD_single;
6294 uint32_t SrcLocStrSize;
6295 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6296 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6297 Value *ThreadId = getOrCreateThreadID(Ident);
6298 Value *Args[] = {Ident, ThreadId};
6299
6300 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6301 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6302
6303 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6304 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6305
6306 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6307 if (Error Err = FiniCB(IP))
6308 return Err;
6309
6310 // The thread that executes the single region must set `DidIt` to 1.
6311 // This is used by __kmpc_copyprivate, to know if the caller is the
6312 // single thread or not.
6313 if (DidIt)
6314 Builder.CreateStore(Builder.getInt32(1), DidIt);
6315
6316 return Error::success();
6317 };
6318
6319 // generates the following:
6320 // if (__kmpc_single()) {
6321 // .... single region ...
6322 // __kmpc_end_single
6323 // }
6324 // __kmpc_copyprivate
6325 // __kmpc_barrier
6326
6327 InsertPointOrErrorTy AfterIP =
6328 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6329 /*Conditional*/ true,
6330 /*hasFinalize*/ true);
6331 if (!AfterIP)
6332 return AfterIP.takeError();
6333
6334 if (DidIt) {
6335 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6336 // NOTE BufSize is currently unused, so just pass 0.
6337 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6338 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6339 CPFuncs[I], DidIt);
6340 // NOTE __kmpc_copyprivate already inserts a barrier
6341 } else if (!IsNowait) {
6342 InsertPointOrErrorTy AfterIP =
6343 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6344 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6345 /* CheckCancelFlag */ false);
6346 if (!AfterIP)
6347 return AfterIP.takeError();
6348 }
6349 return Builder.saveIP();
6350}
6351
6352OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6353 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6354 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6355
6356 if (!updateToLocation(Loc))
6357 return Loc.IP;
6358
6359 Directive OMPD = Directive::OMPD_critical;
6360 uint32_t SrcLocStrSize;
6361 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6362 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6363 Value *ThreadId = getOrCreateThreadID(Ident);
6364 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6365 Value *Args[] = {Ident, ThreadId, LockVar};
6366
6367 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6368 Function *RTFn = nullptr;
6369 if (HintInst) {
6370 // Add Hint to entry Args and create call
6371 EnterArgs.push_back(HintInst);
6372 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6373 } else {
6374 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6375 }
6376 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6377
6378 Function *ExitRTLFn =
6379 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6380 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6381
6382 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6383 /*Conditional*/ false, /*hasFinalize*/ true);
6384}
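// Schematically, the inlined region produced here is:
//   call void @__kmpc_critical(ptr %ident, i32 %tid, ptr @lock)
//   ; ... critical region body ...
//   call void @__kmpc_end_critical(ptr %ident, i32 %tid, ptr @lock)
// where @lock is the named lock from getOMPCriticalRegionLock; with a hint,
// __kmpc_critical_with_hint receives the hint as an extra trailing operand.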
6385
6386OpenMPIRBuilder::InsertPointTy
6387OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6388 InsertPointTy AllocaIP, unsigned NumLoops,
6389 ArrayRef<llvm::Value *> StoreValues,
6390 const Twine &Name, bool IsDependSource) {
6391 assert(
6392 llvm::all_of(StoreValues,
6393 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6394 "OpenMP runtime requires depend vec with i64 type");
6395
6396 if (!updateToLocation(Loc))
6397 return Loc.IP;
6398
6399 // Allocate space for vector and generate alloc instruction.
6400 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6401 Builder.restoreIP(AllocaIP);
6402 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6403 ArgsBase->setAlignment(Align(8));
6404 updateToLocation(Loc);
6405
6406 // Store the index value with offset in depend vector.
6407 for (unsigned I = 0; I < NumLoops; ++I) {
6408 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6409 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6410 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6411 STInst->setAlignment(Align(8));
6412 }
6413
6414 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6415 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6416
6417 uint32_t SrcLocStrSize;
6418 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6419 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6420 Value *ThreadId = getOrCreateThreadID(Ident);
6421 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6422
6423 Function *RTLFn = nullptr;
6424 if (IsDependSource)
6425 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6426 else
6427 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6428 Builder.CreateCall(RTLFn, Args);
6429
6430 return Builder.saveIP();
6431}
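// For `ordered depend(source)` with NumLoops = 2 this emits, schematically:
//   %vec = alloca [2 x i64], align 8
//   ; store the two iteration numbers into %vec[0] and %vec[1]
//   call void @__kmpc_doacross_post(ptr %ident, i32 %tid, ptr %vec)
// while depend(sink: ...) emits __kmpc_doacross_wait on the same vector.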
6432
6433OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6434 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6435 FinalizeCallbackTy FiniCB, bool IsThreads) {
6436 if (!updateToLocation(Loc))
6437 return Loc.IP;
6438
6439 Directive OMPD = Directive::OMPD_ordered;
6440 Instruction *EntryCall = nullptr;
6441 Instruction *ExitCall = nullptr;
6442
6443 if (IsThreads) {
6444 uint32_t SrcLocStrSize;
6445 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6446 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6447 Value *ThreadId = getOrCreateThreadID(Ident);
6448 Value *Args[] = {Ident, ThreadId};
6449
6450 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6451 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6452
6453 Function *ExitRTLFn =
6454 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6455 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6456 }
6457
6458 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6459 /*Conditional*/ false, /*hasFinalize*/ true);
6460}
6461
6462OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6463 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6464 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6465 bool HasFinalize, bool IsCancellable) {
6466
6467 if (HasFinalize)
6468 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6469
6470 // Create inlined region's entry and body blocks, in preparation
6471 // for conditional creation
6472 BasicBlock *EntryBB = Builder.GetInsertBlock();
6473 Instruction *SplitPos = EntryBB->getTerminator();
6474 if (!isa_and_nonnull<BranchInst>(SplitPos))
6475 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6476 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6477 BasicBlock *FiniBB =
6478 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6479
6480 Builder.SetInsertPoint(EntryBB->getTerminator());
6481 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6482
6483 // generate body
6484 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6485 /* CodeGenIP */ Builder.saveIP()))
6486 return Err;
6487
6488 // emit exit call and do any needed finalization.
6489 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6490 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6491 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6492 "Unexpected control flow graph state!!");
6493 InsertPointOrErrorTy AfterIP =
6494 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6495 if (!AfterIP)
6496 return AfterIP.takeError();
6497 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6498 "Unexpected Control Flow State!");
6500
6501 // If we are skipping the region of a non-conditional directive, merge the
6502 // exit block back into its predecessor and reset the builder's insertion point.
6503 assert(SplitPos->getParent() == ExitBB &&
6504 "Unexpected Insertion point location!");
6505 auto merged = MergeBlockIntoPredecessor(ExitBB);
6506 BasicBlock *ExitPredBB = SplitPos->getParent();
6507 auto InsertBB = merged ? ExitPredBB : ExitBB;
6508 if (!isa_and_nonnull<BranchInst>(SplitPos))
6509 SplitPos->eraseFromParent();
6510 Builder.SetInsertPoint(InsertBB);
6511
6512 return Builder.saveIP();
6513}
6514
6515OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6516 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6517 // If there is nothing to do, return the current insertion point.
6518 if (!Conditional || !EntryCall)
6519 return Builder.saveIP();
6520
6521 BasicBlock *EntryBB = Builder.GetInsertBlock();
6522 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6523 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6524 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6525
6526 // Emit thenBB and set the Builder's insertion point there for
6527 // body generation next. Place the block after the current block.
6528 Function *CurFn = EntryBB->getParent();
6529 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6530
6531 // Move Entry branch to end of ThenBB, and replace with conditional
6532 // branch (If-stmt)
6533 Instruction *EntryBBTI = EntryBB->getTerminator();
6534 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6535 EntryBBTI->removeFromParent();
6536 Builder.SetInsertPoint(UI);
6537 Builder.Insert(EntryBBTI);
6538 UI->eraseFromParent();
6539 Builder.SetInsertPoint(ThenBB->getTerminator());
6540
6541 // return an insertion point to ExitBB.
6542 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6543}
6544
6545OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6546 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6547 bool HasFinalize) {
6548
6549 Builder.restoreIP(FinIP);
6550
6551 // If there is finalization to do, emit it before the exit call
6552 if (HasFinalize) {
6553 assert(!FinalizationStack.empty() &&
6554 "Unexpected finalization stack state!");
6555
6556 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6557 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6558
6559 if (Error Err = Fi.FiniCB(FinIP))
6560 return Err;
6561
6562 BasicBlock *FiniBB = FinIP.getBlock();
6563 Instruction *FiniBBTI = FiniBB->getTerminator();
6564
6565 // set Builder IP for call creation
6566 Builder.SetInsertPoint(FiniBBTI);
6567 }
6568
6569 if (!ExitCall)
6570 return Builder.saveIP();
6571
6572 // place the Exitcall as last instruction before Finalization block terminator
6573 ExitCall->removeFromParent();
6574 Builder.Insert(ExitCall);
6575
6576 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6577 ExitCall->getIterator());
6578}
6579
6580OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6581 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6582 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6583 if (!IP.isSet())
6584 return IP;
6585
6586 IRBuilder<>::InsertPointGuard IPG(Builder);
6587
6588 // creates the following CFG structure
6589 // OMP_Entry : (MasterAddr != PrivateAddr)?
6590 // F T
6591 // | \
6592 // | copyin.not.master
6593 // | /
6594 // v /
6595 // copyin.not.master.end
6596 // |
6597 // v
6598 // OMP.Entry.Next
6599
6600 BasicBlock *OMP_Entry = IP.getBlock();
6601 Function *CurFn = OMP_Entry->getParent();
6602 BasicBlock *CopyBegin =
6603 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6604 BasicBlock *CopyEnd = nullptr;
6605
6606 // If entry block is terminated, split to preserve the branch to following
6607 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6608 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6609 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6610 "copyin.not.master.end");
6611 OMP_Entry->getTerminator()->eraseFromParent();
6612 } else {
6613 CopyEnd =
6614 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6615 }
6616
6617 Builder.SetInsertPoint(OMP_Entry);
6618 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6619 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6620 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6621 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6622
6623 Builder.SetInsertPoint(CopyBegin);
6624 if (BranchtoEnd)
6625 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6626
6627 return Builder.saveIP();
6628}
6629
6630CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6631 Value *Size, Value *Allocator,
6632 std::string Name) {
6633 IRBuilder<>::InsertPointGuard IPG(Builder);
6634 updateToLocation(Loc);
6635
6636 uint32_t SrcLocStrSize;
6637 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6638 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6639 Value *ThreadId = getOrCreateThreadID(Ident);
6640 Value *Args[] = {ThreadId, Size, Allocator};
6641
6642 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6643
6644 return Builder.CreateCall(Fn, Args, Name);
6645}
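// The emitted allocation has the shape (size type per the frontend, e.g.):
//   %ptr = call ptr @__kmpc_alloc(i32 %tid, i64 %Size, ptr %Allocator)
// and pairs with createOMPFree below, which emits
//   call void @__kmpc_free(i32 %tid, ptr %ptr, ptr %Allocator)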
6646
6647CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6648 Value *Addr, Value *Allocator,
6649 std::string Name) {
6650 IRBuilder<>::InsertPointGuard IPG(Builder);
6651 updateToLocation(Loc);
6652
6653 uint32_t SrcLocStrSize;
6654 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6655 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6656 Value *ThreadId = getOrCreateThreadID(Ident);
6657 Value *Args[] = {ThreadId, Addr, Allocator};
6658 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6659 return Builder.CreateCall(Fn, Args, Name);
6660}
6661
6662CallInst *OpenMPIRBuilder::createOMPInteropInit(
6663 const LocationDescription &Loc, Value *InteropVar,
6664 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6665 Value *DependenceAddress, bool HaveNowaitClause) {
6666 IRBuilder<>::InsertPointGuard IPG(Builder);
6667 updateToLocation(Loc);
6668
6669 uint32_t SrcLocStrSize;
6670 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6671 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6672 Value *ThreadId = getOrCreateThreadID(Ident);
6673 if (Device == nullptr)
6674 Device = ConstantInt::get(Int32, -1);
6675 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6676 if (NumDependences == nullptr) {
6677 NumDependences = ConstantInt::get(Int32, 0);
6678 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6679 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6680 }
6681 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6682 Value *Args[] = {
6683 Ident, ThreadId, InteropVar, InteropTypeVal,
6684 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6685
6686 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6687
6688 return Builder.CreateCall(Fn, Args);
6689}
6690
6691CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6692 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6693 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6694 IRBuilder<>::InsertPointGuard IPG(Builder);
6695 updateToLocation(Loc);
6696
6697 uint32_t SrcLocStrSize;
6698 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6699 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6700 Value *ThreadId = getOrCreateThreadID(Ident);
6701 if (Device == nullptr)
6702 Device = ConstantInt::get(Int32, -1);
6703 if (NumDependences == nullptr) {
6704 NumDependences = ConstantInt::get(Int32, 0);
6705 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6706 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6707 }
6708 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6709 Value *Args[] = {
6710 Ident, ThreadId, InteropVar, Device,
6711 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6712
6713 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6714
6715 return Builder.CreateCall(Fn, Args);
6716}
6717
6718CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6719 Value *InteropVar, Value *Device,
6720 Value *NumDependences,
6721 Value *DependenceAddress,
6722 bool HaveNowaitClause) {
6723 IRBuilder<>::InsertPointGuard IPG(Builder);
6724 updateToLocation(Loc);
6725 uint32_t SrcLocStrSize;
6726 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6727 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6728 Value *ThreadId = getOrCreateThreadID(Ident);
6729 if (Device == nullptr)
6730 Device = ConstantInt::get(Int32, -1);
6731 if (NumDependences == nullptr) {
6732 NumDependences = ConstantInt::get(Int32, 0);
6733 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6734 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6735 }
6736 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6737 Value *Args[] = {
6738 Ident, ThreadId, InteropVar, Device,
6739 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6740
6741 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6742
6743 return Builder.CreateCall(Fn, Args);
6744}
6745
6746CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6747 const LocationDescription &Loc, llvm::Value *Pointer,
6748 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6749 IRBuilder<>::InsertPointGuard IPG(Builder);
6750 updateToLocation(Loc);
6751
6752 uint32_t SrcLocStrSize;
6753 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6754 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6755 Value *ThreadId = getOrCreateThreadID(Ident);
6756 Constant *ThreadPrivateCache =
6757 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6758 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6759
6760 Function *Fn =
6761 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6762
6763 return Builder.CreateCall(Fn, Args);
6764}
6765
6766OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6767 const LocationDescription &Loc,
6768 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6769 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6770 "expected num_threads and num_teams to be specified");
6771
6772 if (!updateToLocation(Loc))
6773 return Loc.IP;
6774
6775 uint32_t SrcLocStrSize;
6776 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6777 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6778 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6779 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6780 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6781 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6782 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6783
6784 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6785 Function *Kernel = DebugKernelWrapper;
6786
6787 // We need to strip the debug prefix to get the correct kernel name.
6788 StringRef KernelName = Kernel->getName();
6789 const std::string DebugPrefix = "_debug__";
6790 if (KernelName.ends_with(DebugPrefix)) {
6791 KernelName = KernelName.drop_back(DebugPrefix.length());
6792 Kernel = M.getFunction(KernelName);
6793 assert(Kernel && "Expected the real kernel to exist");
6794 }
6795
6796 // Manifest the launch configuration in the metadata matching the kernel
6797 // environment.
6798 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6799 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6800
6801 // If MaxThreads is not set, select the maximum of the default workgroup
6802 // size and the MinThreads value.
6803 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6804 if (MaxThreadsVal < 0)
6805 MaxThreadsVal = std::max(
6806 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6807
6808 if (MaxThreadsVal > 0)
6809 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6810
6811 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6812 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6813 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6814 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6815 Constant *ReductionDataSize =
6816 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6817 Constant *ReductionBufferLength =
6818 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6819
6820 Function *Fn = getOrCreateRuntimeFunctionPtr(
6821 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6822 const DataLayout &DL = Fn->getDataLayout();
6823
6824 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6825 Constant *DynamicEnvironmentInitializer =
6826 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6827 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6828 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6829 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6830 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6831 DL.getDefaultGlobalsAddressSpace());
6832 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6833
6834 Constant *DynamicEnvironment =
6835 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6836 ? DynamicEnvironmentGV
6837 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6838 DynamicEnvironmentPtr);
6839
6840 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6841 ConfigurationEnvironment, {
6842 UseGenericStateMachineVal,
6843 MayUseNestedParallelismVal,
6844 IsSPMDVal,
6845 MinThreads,
6846 MaxThreads,
6847 MinTeams,
6848 MaxTeams,
6849 ReductionDataSize,
6850 ReductionBufferLength,
6851 });
6852 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6853 KernelEnvironment, {
6854 ConfigurationEnvironmentInitializer,
6855 Ident,
6856 DynamicEnvironment,
6857 });
6858 std::string KernelEnvironmentName =
6859 (KernelName + "_kernel_environment").str();
6860 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6861 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6862 KernelEnvironmentInitializer, KernelEnvironmentName,
6863 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6864 DL.getDefaultGlobalsAddressSpace());
6865 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6866
6867 Constant *KernelEnvironment =
6868 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6869 ? KernelEnvironmentGV
6870 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6871 KernelEnvironmentPtr);
6872 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6873 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6874 KernelLaunchEnvironment =
6875 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6876 ? KernelLaunchEnvironment
6877 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6878 KernelLaunchEnvParamTy);
6879 CallInst *ThreadKind =
6880 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6881
6882 Value *ExecUserCode = Builder.CreateICmpEQ(
6883 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6884 "exec_user_code");
6885
6886 // ThreadKind = __kmpc_target_init(...)
6887 // if (ThreadKind == -1)
6888 // user_code
6889 // else
6890 // return;
6891
6892 auto *UI = Builder.CreateUnreachable();
6893 BasicBlock *CheckBB = UI->getParent();
6894 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6895
6896 BasicBlock *WorkerExitBB = BasicBlock::Create(
6897 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6898 Builder.SetInsertPoint(WorkerExitBB);
6899 Builder.CreateRetVoid();
6900
6901 auto *CheckBBTI = CheckBB->getTerminator();
6902 Builder.SetInsertPoint(CheckBBTI);
6903 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6904
6905 CheckBBTI->eraseFromParent();
6906 UI->eraseFromParent();
6907
6908 // Continue in the "user_code" block, see diagram above and in
6909 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6910 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6911}
6912
6913void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6914 int32_t TeamsReductionDataSize,
6915 int32_t TeamsReductionBufferLength) {
6916 if (!updateToLocation(Loc))
6917 return;
6918
6919 Function *Fn = getOrCreateRuntimeFunctionPtr(
6920 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6921
6922 Builder.CreateCall(Fn, {});
6923
6924 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6925 return;
6926
6927 Function *Kernel = Builder.GetInsertBlock()->getParent();
6928 // We need to strip the debug prefix to get the correct kernel name.
6929 StringRef KernelName = Kernel->getName();
6930 const std::string DebugPrefix = "_debug__";
6931 if (KernelName.ends_with(DebugPrefix))
6932 KernelName = KernelName.drop_back(DebugPrefix.length());
6933 auto *KernelEnvironmentGV =
6934 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6935 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6936 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6937 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6938 KernelEnvironmentInitializer,
6939 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6940 NewInitializer = ConstantFoldInsertValueInstruction(
6941 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6942 {0, 8});
6943 KernelEnvironmentGV->setInitializer(NewInitializer);
6944}
6945
6946static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6947 bool Min) {
6948 if (Kernel.hasFnAttribute(Name)) {
6949 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6950 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6951 }
6952 Kernel.addFnAttr(Name, llvm::utostr(Value));
6953}
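// For example, if the kernel already carries "nvvm.maxntid"="256" and is
// updated with Value = 128 and Min = true, the attribute becomes
// min(256, 128) = 128; with Min = false it would stay at 256.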
6954
6955std::pair<int32_t, int32_t>
6956OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6957 int32_t ThreadLimit =
6958 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6959
6960 if (T.isAMDGPU()) {
6961 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6962 if (!Attr.isValid() || !Attr.isStringAttribute())
6963 return {0, ThreadLimit};
6964 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6965 int32_t LB, UB;
6966 if (!llvm::to_integer(UBStr, UB, 10))
6967 return {0, ThreadLimit};
6968 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6969 if (!llvm::to_integer(LBStr, LB, 10))
6970 return {0, UB};
6971 return {LB, UB};
6972 }
6973
6974 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6975 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6976 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6977 }
6978 return {0, ThreadLimit};
6979}
6980
6981void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6982 Function &Kernel, int32_t LB,
6983 int32_t UB) {
6984 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6985
6986 if (T.isAMDGPU()) {
6987 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6988 llvm::utostr(LB) + "," + llvm::utostr(UB));
6989 return;
6990 }
6991
6992 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6993}
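// E.g. writeThreadBoundsForKernel(T, K, /*LB=*/1, /*UB=*/256) yields
// "omp_target_thread_limit"="256" on all targets, plus
// "amdgpu-flat-work-group-size"="1,256" on AMDGPU; on NVPTX it instead
// tightens "nvvm.maxntid" to at most 256.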
6994
6995std::pair<int32_t, int32_t>
6996OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6997 // TODO: Read from backend annotations if available.
6998 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6999}
7000
7001void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7002 int32_t LB, int32_t UB) {
7003 if (T.isNVPTX())
7004 if (UB > 0)
7005 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7006 if (T.isAMDGPU())
7007 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7008
7009 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7010}
7011
7012void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7013 Function *OutlinedFn) {
7014 if (Config.isTargetDevice()) {
7015 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7016 // TODO: Determine if DSO local can be set to true.
7017 OutlinedFn->setDSOLocal(false);
7018 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7019 if (T.isAMDGCN())
7020 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7021 else if (T.isNVPTX())
7022 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7023 else if (T.isSPIRV())
7024 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7025 }
7026}
7027
7028Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7029 StringRef EntryFnIDName) {
7030 if (Config.isTargetDevice()) {
7031 assert(OutlinedFn && "The outlined function must exist if embedded");
7032 return OutlinedFn;
7033 }
7034
7035 return new GlobalVariable(
7036 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7037 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7038}
7039
7040Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7041 StringRef EntryFnName) {
7042 if (OutlinedFn)
7043 return OutlinedFn;
7044
7045 assert(!M.getGlobalVariable(EntryFnName, true) &&
7046 "Named kernel already exists?");
7047 return new GlobalVariable(
7048 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7049 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7050}
7051
7052Error OpenMPIRBuilder::emitTargetRegionFunction(
7053 TargetRegionEntryInfo &EntryInfo,
7054 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7055 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7056
7057 SmallString<64> EntryFnName;
7058 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7059
7060 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7061 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7062 if (!CBResult)
7063 return CBResult.takeError();
7064 OutlinedFn = *CBResult;
7065 } else {
7066 OutlinedFn = nullptr;
7067 }
7068
7069 // If this target outline function is not an offload entry, we don't need to
7070 // register it. This may be the case for a false if clause, or if there are
7071 // no OpenMP targets.
7072 if (!IsOffloadEntry)
7073 return Error::success();
7074
7075 std::string EntryFnIDName =
7076 Config.isTargetDevice()
7077 ? std::string(EntryFnName)
7078 : createPlatformSpecificName({EntryFnName, "region_id"});
7079
7080 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7081 EntryFnName, EntryFnIDName);
7082 return Error::success();
7083}
7084
7085Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7086 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7087 StringRef EntryFnName, StringRef EntryFnIDName) {
7088 if (OutlinedFn)
7089 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7090 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7091 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7092 OffloadInfoManager.registerTargetRegionEntryInfo(
7093 EntryInfo, EntryAddr, OutlinedFnID,
7094 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7095 return OutlinedFnID;
7096}
7097
7098OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7099 const LocationDescription &Loc, InsertPointTy AllocaIP,
7100 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7101 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7102 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7103 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7104 BodyGenTy BodyGenType)>
7105 BodyGenCB,
7106 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7107 if (!updateToLocation(Loc))
7108 return InsertPointTy();
7109
7110 Builder.restoreIP(CodeGenIP);
7111 // Disable TargetData CodeGen on Device pass.
7112 if (Config.IsTargetDevice.value_or(false)) {
7113 if (BodyGenCB) {
7114 InsertPointOrErrorTy AfterIP =
7115 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7116 if (!AfterIP)
7117 return AfterIP.takeError();
7118 Builder.restoreIP(*AfterIP);
7119 }
7120 return Builder.saveIP();
7121 }
7122
7123 bool IsStandAlone = !BodyGenCB;
7124 MapInfosTy *MapInfo;
7125 // Generate the code for the opening of the data environment. Capture all the
7126 // arguments of the runtime call by reference because they are used in the
7127 // closing of the region.
7128 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7129 InsertPointTy CodeGenIP) -> Error {
7130 MapInfo = &GenMapInfoCB(Builder.saveIP());
7131 if (Error Err = emitOffloadingArrays(
7132 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7133 /*IsNonContiguous=*/true, DeviceAddrCB))
7134 return Err;
7135
7136 TargetDataRTArgs RTArgs;
7137 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7138
7139 // Emit the number of elements in the offloading arrays.
7140 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7141
7142 // Source location for the ident struct
7143 if (!SrcLocInfo) {
7144 uint32_t SrcLocStrSize;
7145 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7146 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7147 }
7148
7149 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7150 SrcLocInfo, DeviceID,
7151 PointerNum, RTArgs.BasePointersArray,
7152 RTArgs.PointersArray, RTArgs.SizesArray,
7153 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7154 RTArgs.MappersArray};
7155
7156 if (IsStandAlone) {
7157 assert(MapperFunc && "MapperFunc missing for standalone target data");
7158
7159 auto TaskBodyCB = [&](Value *, Value *,
7160 IRBuilderBase::InsertPoint) -> Error {
7161 if (Info.HasNoWait) {
7162 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7163 llvm::Constant::getNullValue(VoidPtr),
7164 llvm::Constant::getNullValue(Int32),
7165 llvm::Constant::getNullValue(VoidPtr)});
7166 }
7167
7168 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7169 OffloadingArgs);
7170
7171 if (Info.HasNoWait) {
7172 BasicBlock *OffloadContBlock =
7173 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7174 Function *CurFn = Builder.GetInsertBlock()->getParent();
7175 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7176 Builder.restoreIP(Builder.saveIP());
7177 }
7178 return Error::success();
7179 };
7180
7181 bool RequiresOuterTargetTask = Info.HasNoWait;
7182 if (!RequiresOuterTargetTask)
7183 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7184 /*TargetTaskAllocaIP=*/{}));
7185 else
7186 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7187 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7188 } else {
7189 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7190 omp::OMPRTL___tgt_target_data_begin_mapper);
7191
7192 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7193
7194 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7195 if (isa<AllocaInst>(DeviceMap.second.second)) {
7196 auto *LI =
7197 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7198 Builder.CreateStore(LI, DeviceMap.second.second);
7199 }
7200 }
7201
7202 // If device pointer privatization is required, emit the body of the
7203 // region here. It will have to be duplicated: with and without
7204 // privatization.
7205 InsertPointOrErrorTy AfterIP =
7206 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7207 if (!AfterIP)
7208 return AfterIP.takeError();
7209 Builder.restoreIP(*AfterIP);
7210 }
7211 return Error::success();
7212 };
7213
7214 // If we need device pointer privatization, we need to emit the body of the
7215 // region with no privatization in the 'else' branch of the conditional.
7216 // Otherwise, we don't have to do anything.
7217 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7218 InsertPointTy CodeGenIP) -> Error {
7219 InsertPointOrErrorTy AfterIP =
7220 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7221 if (!AfterIP)
7222 return AfterIP.takeError();
7223 Builder.restoreIP(*AfterIP);
7224 return Error::success();
7225 };
7226
7227 // Generate code for the closing of the data region.
7228 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7229 TargetDataRTArgs RTArgs;
7230 Info.EmitDebug = !MapInfo->Names.empty();
7231 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7232
7233 // Emit the number of elements in the offloading arrays.
7234 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7235
7236 // Source location for the ident struct
7237 if (!SrcLocInfo) {
7238 uint32_t SrcLocStrSize;
7239 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7240 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7241 }
7242
7243 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7244 PointerNum, RTArgs.BasePointersArray,
7245 RTArgs.PointersArray, RTArgs.SizesArray,
7246 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7247 RTArgs.MappersArray};
7248 Function *EndMapperFunc =
7249 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7250
7251 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7252 return Error::success();
7253 };
7254
7255 // We don't have to do anything to close the region if the if clause evaluates
7256 // to false.
7257 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7258 return Error::success();
7259 };
7260
7261 Error Err = [&]() -> Error {
7262 if (BodyGenCB) {
7263 Error Err = [&]() {
7264 if (IfCond)
7265 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7266 return BeginThenGen(AllocaIP, Builder.saveIP());
7267 }();
7268
7269 if (Err)
7270 return Err;
7271
7272 // If we don't require privatization of device pointers, we emit the body
7273 // in between the runtime calls. This avoids duplicating the body code.
7274 InsertPointOrErrorTy AfterIP =
7275 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7276 if (!AfterIP)
7277 return AfterIP.takeError();
7278 restoreIPandDebugLoc(Builder, *AfterIP);
7279
7280 if (IfCond)
7281 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7282 return EndThenGen(AllocaIP, Builder.saveIP());
7283 }
7284 if (IfCond)
7285 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7286 return BeginThenGen(AllocaIP, Builder.saveIP());
7287 }();
7288
7289 if (Err)
7290 return Err;
7291
7292 return Builder.saveIP();
7293}
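// Rough shape of the code produced above for a non-standalone region without
// an if clause (a sketch; names invented and the privatization variants are
// elided):
//
//   call void @__tgt_target_data_begin_mapper(ptr %ident, i64 %device_id,
//       i32 %num_ptrs, ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes,
//       ptr %names, ptr %mappers)
//   ; ... region body via BodyGenCB (BodyGenTy::NoPriv) ...
//   call void @__tgt_target_data_end_mapper(<same argument list, rebuilt
//       with ForEndCall=true>)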
7294
7295FunctionCallee
7296OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7297 bool IsGPUDistribute) {
7298 assert((IVSize == 32 || IVSize == 64) &&
7299 "IV size is not compatible with the omp runtime");
7300 RuntimeFunction Name;
7301 if (IsGPUDistribute)
7302 Name = IVSize == 32
7303 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7304 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7305 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7306 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7307 else
7308 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7309 : omp::OMPRTL___kmpc_for_static_init_4u)
7310 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7311 : omp::OMPRTL___kmpc_for_static_init_8u);
7312
7313 return getOrCreateRuntimeFunction(M, Name);
7314}
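// Example of the selection above: IVSize = 32 with IVSigned = true picks
// @__kmpc_for_static_init_4 (or @__kmpc_distribute_static_init_4 when
// IsGPUDistribute is set); IVSize = 64 with IVSigned = false picks
// @__kmpc_for_static_init_8u.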
7315
7316FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7317 bool IVSigned) {
7318 assert((IVSize == 32 || IVSize == 64) &&
7319 "IV size is not compatible with the omp runtime");
7320 RuntimeFunction Name = IVSize == 32
7321 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7322 : omp::OMPRTL___kmpc_dispatch_init_4u)
7323 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7324 : omp::OMPRTL___kmpc_dispatch_init_8u);
7325
7326 return getOrCreateRuntimeFunction(M, Name);
7327}
7328
7329FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7330 bool IVSigned) {
7331 assert((IVSize == 32 || IVSize == 64) &&
7332 "IV size is not compatible with the omp runtime");
7333 RuntimeFunction Name = IVSize == 32
7334 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7335 : omp::OMPRTL___kmpc_dispatch_next_4u)
7336 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7337 : omp::OMPRTL___kmpc_dispatch_next_8u);
7338
7339 return getOrCreateRuntimeFunction(M, Name);
7340}
7341
7342FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7343 bool IVSigned) {
7344 assert((IVSize == 32 || IVSize == 64) &&
7345 "IV size is not compatible with the omp runtime");
7346 RuntimeFunction Name = IVSize == 32
7347 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7348 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7349 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7350 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7351
7352 return getOrCreateRuntimeFunction(M, Name);
7353}
7354
7355FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7356 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7357}
7358
7359static void FixupDebugInfoForOutlinedFunction(
7360 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7361 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7362
7363 DISubprogram *NewSP = Func->getSubprogram();
7364 if (!NewSP)
7365 return;
7366
7367 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7368
7369 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7370 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7371 // Only use the cached variable if the arg number matches. This is important
7372 // so that DIVariables created for privatized variables are not discarded.
7373 if (NewVar && (arg == NewVar->getArg()))
7374 return NewVar;
7375
7376 NewVar = llvm::DILocalVariable::get(
7377 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7378 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7379 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7380 return NewVar;
7381 };
7382
7383 auto UpdateDebugRecord = [&](auto *DR) {
7384 DILocalVariable *OldVar = DR->getVariable();
7385 unsigned ArgNo = 0;
7386 for (auto Loc : DR->location_ops()) {
7387 auto Iter = ValueReplacementMap.find(Loc);
7388 if (Iter != ValueReplacementMap.end()) {
7389 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7390 ArgNo = std::get<1>(Iter->second) + 1;
7391 }
7392 }
7393 if (ArgNo != 0)
7394 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7395 };
7396
7397 // The location and scope of variable intrinsics and records still point to
7398 // the parent function of the target region. Update them.
7399 for (Instruction &I : instructions(Func)) {
7400 assert(!isa<llvm::DbgInfoIntrinsic>(&I) &&
7401 "Unexpected debug intrinsic");
7402 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7403 UpdateDebugRecord(&DVR);
7404 }
7405 // An extra argument is passed to the device. Create the debug data for it.
7406 if (OMPBuilder.Config.isTargetDevice()) {
7407 DICompileUnit *CU = NewSP->getUnit();
7408 Module *M = Func->getParent();
7409 DIBuilder DB(*M, true, CU);
7410 DIType *VoidPtrTy =
7411 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7412 DILocalVariable *Var = DB.createParameterVariable(
7413 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7414 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7415 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7416 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7417 &(*Func->begin()));
7418 }
7419}
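// On the device this leaves the kernel with one extra parameter record,
// roughly the following (a sketch in the debug-record rendering; metadata
// details invented for the example):
//
//   #dbg_declare(ptr %dyn_ptr,
//                !DILocalVariable(name: "dyn_ptr", arg: 1,
//                                 flags: DIFlagArtificial),
//                !DIExpression(), !DILocation(line: 0, scope: <kernel SP>))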
7420
7421static Value *removeASCastIfPresent(Value *V) {
7422 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7423 return cast<Operator>(V)->getOperand(0);
7424 return V;
7425}
7426
7427static Expected<Function *> createOutlinedFunction(
7428 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7429 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7430 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7431 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7432 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7433 SmallVector<Type *> ParameterTypes;
7434 if (OMPBuilder.Config.isTargetDevice()) {
7435 // Add the "implicit" runtime argument we use to provide launch specific
7436 // information for target devices.
7437 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7438 ParameterTypes.push_back(Int8PtrTy);
7439
7440 // All parameters to target devices are passed as pointers
7441 // or i64. This assumes 64-bit address spaces/pointers.
7442 for (auto &Arg : Inputs)
7443 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7444 ? Arg->getType()
7445 : Type::getInt64Ty(Builder.getContext()));
7446 } else {
7447 for (auto &Arg : Inputs)
7448 ParameterTypes.push_back(Arg->getType());
7449 }
7450
7451 auto BB = Builder.GetInsertBlock();
7452 auto M = BB->getModule();
7453 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7454 /*isVarArg*/ false);
7455 auto Func =
7456 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7457
7458 // Forward target-cpu and target-features function attributes from the
7459 // original function to the new outlined function.
7460 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7461
7462 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7463 if (TargetCpuAttr.isStringAttribute())
7464 Func->addFnAttr(TargetCpuAttr);
7465
7466 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7467 if (TargetFeaturesAttr.isStringAttribute())
7468 Func->addFnAttr(TargetFeaturesAttr);
7469
7470 if (OMPBuilder.Config.isTargetDevice()) {
7471 Value *ExecMode =
7472 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7473 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7474 }
7475
7476 // Save insert point.
7477 IRBuilder<>::InsertPointGuard IPG(Builder);
7478 // We will generate the entries in the outlined function but the debug
7479 // location may still be pointing to the parent function. Reset it now.
7480 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7481
7482 // Generate the region into the function.
7483 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7484 Builder.SetInsertPoint(EntryBB);
7485
7486 // Insert target init call in the device compilation pass.
7487 if (OMPBuilder.Config.isTargetDevice())
7488 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7489
7490 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7491
7492 // As we embed the user code in the middle of our target region after we
7493 // generate entry code, we must move what allocas we can into the entry
7494 // block to avoid breaking optimisations for the device.
7495 if (OMPBuilder.Config.isTargetDevice())
7496 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7497
7498 // Insert target deinit call in the device compilation pass.
7499 BasicBlock *OutlinedBodyBB =
7500 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7501 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7502 Builder.saveIP(),
7503 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7504 if (!AfterIP)
7505 return AfterIP.takeError();
7506 Builder.restoreIP(*AfterIP);
7507 if (OMPBuilder.Config.isTargetDevice())
7508 OMPBuilder.createTargetDeinit(Builder);
7509
7510 // Insert return instruction.
7511 Builder.CreateRetVoid();
7512
7513 // New Alloca IP at entry point of created device function.
7514 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7515 auto AllocaIP = Builder.saveIP();
7516
7517 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7518
7519 // Skip the artificial dyn_ptr on the device.
7520 const auto &ArgRange =
7521 OMPBuilder.Config.isTargetDevice()
7522 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7523 : Func->args();
7524
7525 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7526
7527 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7528 // Things like GEPs can come in the form of Constants. Constants and
7529 // ConstantExprs do not know what they are contained in, so we must dig
7530 // a little to find an instruction before we can tell whether they are
7531 // used inside the function we are outlining. We also replace the
7532 // original constant expression with an equivalent instruction, because
7533 // an instruction allows easy modification in the following loop: we
7534 // then know the (former) constant is owned by our target function, and
7535 // replaceUsesOfWith can be invoked on it (which is not possible with
7536 // constants). A brand new instruction also lets us be cautious, as it
7537 // is perhaps possible that the old expression was used inside the
7538 // function but also exists and is used externally (unlikely by the
7539 // nature of a Constant, but still possible).
7540 // NOTE: We cannot remove dead constants that have been rewritten to
7541 // instructions at this stage, we run the risk of breaking later lowering
7542 // by doing so as we could still be in the process of lowering the module
7543 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7544 // constants we have created rewritten versions of.
7545 if (auto *Const = dyn_cast<Constant>(Input))
7546 convertUsersOfConstantsToInstructions(Const, Func, false);
7547
7548 // Collect users before iterating over them to avoid invalidating the
7549 // iteration in case a user uses Input more than once (e.g. a call
7550 // instruction).
7551 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7552 // Collect all the instructions
7553 for (User *User : make_early_inc_range(Users))
7554 if (auto *Instr = dyn_cast<Instruction>(User))
7555 if (Instr->getFunction() == Func)
7556 Instr->replaceUsesOfWith(Input, InputCopy);
7557 };
7558
7559 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7560
7561 // Rewrite uses of input values to parameters.
7562 for (auto InArg : zip(Inputs, ArgRange)) {
7563 Value *Input = std::get<0>(InArg);
7564 Argument &Arg = std::get<1>(InArg);
7565 Value *InputCopy = nullptr;
7566
7567 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7568 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7569 if (!AfterIP)
7570 return AfterIP.takeError();
7571 Builder.restoreIP(*AfterIP);
7572 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7573
7574 // In certain cases a Global may be set up for replacement; however, this
7575 // Global may be used in multiple arguments to the kernel, just segmented
7576 // apart. For example, if we have a global array that is sectioned into
7577 // multiple mappings (technically not legal in OpenMP, but there is a case
7578 // in Fortran for Common Blocks where this is necessary), we will end up
7579 // with GEPs into this array inside the kernel that refer to the Global
7580 // but are technically separate arguments to the kernel for all intents
7581 // and purposes. If we have mapped a segment that requires a GEP into the
7582 // 0-th index, it will fold into a reference to the Global; if we then
7583 // encounter this folded GEP during replacement, all of the references to
7584 // the Global in the kernel will be replaced with the argument we have
7585 // generated that corresponds to it, including any other GEPs that refer
7586 // to the Global and may be other arguments. This would invalidate all of
7587 // the other preceding mapped arguments that refer to the same global but
7588 // may be separate segments. To prevent this, we defer global processing
7589 // until all other processing has been performed.
7590 if (llvm::isa<llvm::GlobalValue>(
7591 removeASCastIfPresent(Input))) {
7592 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7593 continue;
7594 }
7595
7596 if (llvm::isa<llvm::ConstantData>(Input))
7597 continue;
7598
7599 ReplaceValue(Input, InputCopy, Func);
7600 }
7601
7602 // Replace all of our deferred Input values, currently just Globals.
7603 for (auto Deferred : DeferredReplacement)
7604 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7605
7606 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7607 ValueReplacementMap);
7608 return Func;
7609}
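// For reference, a device-side outlined function created here has a shape
// like the following (a sketch; the name and the argument split are
// illustrative only):
//
//   define internal void @__omp_offloading_<...>_foo(ptr %dyn_ptr, ptr %a,
//                                                    i64 %n) {
//   entry:
//     ; allocas raised here, then target init, outlined body, target deinit
//     ret void
//   }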
7610/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7611/// of pointers containing shared data between the parent task and the created
7612/// task.
7613static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7614 IRBuilderBase &Builder,
7615 Value *TaskWithPrivates,
7616 Type *TaskWithPrivatesTy) {
7617
7618 Type *TaskTy = OMPIRBuilder.Task;
7619 LLVMContext &Ctx = Builder.getContext();
7620 Value *TaskT =
7621 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7622 Value *Shareds = TaskT;
7623 // TaskWithPrivatesTy can be one of the following
7624 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7625 // %struct.privates }
7626 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7627 //
7628 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7629 // its first member has to be the task descriptor. TaskTy is the type of the
7630 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7631 // first member of TaskT, gives us the pointer to shared data.
7632 if (TaskWithPrivatesTy != TaskTy)
7633 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7634 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7635}
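// Illustrative IR for the wrapped case (TaskWithPrivatesTy != TaskTy); value
// names are invented for the sketch:
//
//   %task_t = getelementptr %struct.task_with_privates, ptr %task, i32 0, i32 0
//   %shareds_gep = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task_t,
//                                i32 0, i32 0
//   %shareds = load ptr, ptr %shareds_gep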
7636/// Create an entry point for a target task. It will have the following
7637/// signature:
7638/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7639/// This function is called from emitTargetTask once the
7640/// code to launch the target kernel has been outlined.
7641/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7642/// into the task structure so that the deferred target task can access this
7643/// data even after the stack frame of the generating task has been rolled
7644/// back. Offloading arrays contain base pointers, pointers, sizes, etc.,
7645/// of the data that the target kernel will access. These in effect are the
7646/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7647static Function *emitTargetTaskProxyFunction(
7648 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7649 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7650 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7651
7652 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7653 // This is because PrivatesTy is the type of the structure in which
7654 // we pass the offloading arrays to the deferred target task.
7655 assert((!NumOffloadingArrays || PrivatesTy) &&
7656 "PrivatesTy cannot be nullptr when there are offloading arrays "
7657 "to privatize");
7658
7659 Module &M = OMPBuilder.M;
7660 // KernelLaunchFunction is the target launch function, i.e.
7661 // the function that sets up kernel arguments and calls
7662 // __tgt_target_kernel to launch the kernel on the device.
7663 //
7664 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7665
7666 // StaleCI is the CallInst which is the call to the outlined
7667 // target kernel launch function. If there are local live-in values
7668 // that the outlined function uses then these are aggregated into a structure
7669 // which is passed as the second argument. If there are no local live-in
7670 // values or if all values used by the outlined kernel are global variables,
7671 // then there's only one argument, the threadID. So, StaleCI can be
7672 //
7673 // %structArg = alloca { ptr, ptr }, align 8
7674 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7675 // store ptr %20, ptr %gep_, align 8
7676 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7677 // store ptr %21, ptr %gep_8, align 8
7678 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7679 //
7680 // OR
7681 //
7682 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7683 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7684 StaleCI->getIterator());
7685
7686 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7687
7688 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7689 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7690 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7691
7692 auto ProxyFnTy =
7693 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7694 /* isVarArg */ false);
7695 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7696 ".omp_target_task_proxy_func",
7697 Builder.GetInsertBlock()->getModule());
7698 Value *ThreadId = ProxyFn->getArg(0);
7699 Value *TaskWithPrivates = ProxyFn->getArg(1);
7700 ThreadId->setName("thread.id");
7701 TaskWithPrivates->setName("task");
7702
7703 bool HasShareds = SharedArgsOperandNo > 0;
7704 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7705 BasicBlock *EntryBB =
7706 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7707 Builder.SetInsertPoint(EntryBB);
7708
7709 SmallVector<Value *> KernelLaunchArgs;
7710 KernelLaunchArgs.reserve(StaleCI->arg_size());
7711 KernelLaunchArgs.push_back(ThreadId);
7712
7713 if (HasOffloadingArrays) {
7714 assert(TaskTy != TaskWithPrivatesTy &&
7715 "If there are offloading arrays to pass to the target"
7716 "TaskTy cannot be the same as TaskWithPrivatesTy");
7717 (void)TaskTy;
7718 Value *Privates =
7719 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7720 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7721 KernelLaunchArgs.push_back(
7722 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7723 }
7724
7725 if (HasShareds) {
7726 auto *ArgStructAlloca =
7727 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7728 assert(ArgStructAlloca &&
7729 "Unable to find the alloca instruction corresponding to arguments "
7730 "for extracted function");
7731 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7732
7733 AllocaInst *NewArgStructAlloca =
7734 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7735
7736 Value *SharedsSize =
7737 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7738
7739 LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
7740 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7741
7742 Builder.CreateMemCpy(
7743 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7744 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7745 KernelLaunchArgs.push_back(NewArgStructAlloca);
7746 }
7747 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7748 Builder.CreateRetVoid();
7749 return ProxyFn;
7750}
7751static Type *getOffloadingArrayType(Value *V) {
7752
7753 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7754 return GEP->getSourceElementType();
7755 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7756 return Alloca->getAllocatedType();
7757
7758 llvm_unreachable("Unhandled Instruction type");
7759 return nullptr;
7760}
7761// This function returns a struct that has at most two members.
7762// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7763// descriptor. The second member, if needed, is a struct containing arrays
7764// that need to be passed to the offloaded target kernel. For example,
7765// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7766// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7767// respectively, then the types created by this function are
7768//
7769// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7770// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7771// %struct.privates }
7772// %struct.task_with_privates is returned by this function.
7773// If there aren't any offloading arrays to pass to the target kernel,
7774// %struct.kmp_task_ompbuilder_t is returned.
7775static StructType *
7776createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7777 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7778
7779 if (OffloadingArraysToPrivatize.empty())
7780 return OMPIRBuilder.Task;
7781
7782 SmallVector<Type *, 4> StructFieldTypes;
7783 for (Value *V : OffloadingArraysToPrivatize) {
7784 assert(V->getType()->isPointerTy() &&
7785 "Expected pointer to array to privatize. Got a non-pointer value "
7786 "instead");
7787 Type *ArrayTy = getOffloadingArrayType(V);
7788 assert(ArrayTy && "ArrayType cannot be nullptr");
7789 StructFieldTypes.push_back(ArrayTy);
7790 }
7791 StructType *PrivatesStructTy =
7792 StructType::create(StructFieldTypes, "struct.privates");
7793 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7794 "struct.task_with_privates");
7795}
7796static Error emitTargetOutlinedFunction(
7797 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7798 TargetRegionEntryInfo &EntryInfo,
7799 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7800 Function *&OutlinedFn, Constant *&OutlinedFnID,
7801 SmallVectorImpl<Value *> &Inputs,
7802 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7803 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7804
7805 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7806 [&](StringRef EntryFnName) {
7807 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7808 EntryFnName, Inputs, CBFunc,
7809 ArgAccessorFuncCB);
7810 };
7811
7812 return OMPBuilder.emitTargetRegionFunction(
7813 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7814 OutlinedFnID);
7815}
7816
7817OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7818 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7819 OpenMPIRBuilder::InsertPointTy AllocaIP,
7820 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7821 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7822
7823 // The following explains the code-gen scenario for the `target` directive. A
7824 // similar scenario is followed for other device-related directives (e.g.
7825 // `target enter data`), but in a simpler fashion, since we only need to emit
7826 // a task that encapsulates the proper runtime call.
7827 //
7828 // When we arrive at this function, the target region itself has been
7829 // outlined into the function OutlinedFn.
7830 // So at this point, for
7831 // --------------------------------------------------------------
7832 // void user_code_that_offloads(...) {
7833 // omp target depend(..) map(from:a) map(to:b) private(i)
7834 // do i = 1, 10
7835 // a(i) = b(i) + n
7836 // }
7837 //
7838 // --------------------------------------------------------------
7839 //
7840 // we have
7841 //
7842 // --------------------------------------------------------------
7843 //
7844 // void user_code_that_offloads(...) {
7845 // %.offload_baseptrs = alloca [2 x ptr], align 8
7846 // %.offload_ptrs = alloca [2 x ptr], align 8
7847 // %.offload_mappers = alloca [2 x ptr], align 8
7848 // ;; target region has been outlined and now we need to
7849 // ;; offload to it via a target task.
7850 // }
7851 // void outlined_device_function(ptr a, ptr b, ptr n) {
7852 // n = *n_ptr;
7853 // do i = 1, 10
7854 // a(i) = b(i) + n
7855 // }
7856 //
7857 // We have to now do the following
7858 // (i) Make an offloading call to outlined_device_function using the OpenMP
7859 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7860 // emitted by emitKernelLaunch
7861 // (ii) Create a task entry point function that calls kernel_launch_function
7862 // and is the entry point for the target task. See
7863 // '@.omp_target_task_proxy_func in the pseudocode below.
7864 // (iii) Create a task with the task entry point created in (ii)
7865 //
7866 // That is we create the following
7867 // struct task_with_privates {
7868 // struct kmp_task_ompbuilder_t task_struct;
7869 // struct privates {
7870 // [2 x ptr] ; baseptrs
7871 // [2 x ptr] ; ptrs
7872 // [2 x i64] ; sizes
7873 // }
7874 // }
7875 // void user_code_that_offloads(...) {
7876 // %.offload_baseptrs = alloca [2 x ptr], align 8
7877 // %.offload_ptrs = alloca [2 x ptr], align 8
7878 // %.offload_sizes = alloca [2 x i64], align 8
7879 //
7880 // %structArg = alloca { ptr, ptr, ptr }, align 8
7881 // %strucArg[0] = a
7882 // %strucArg[1] = b
7883 // %strucArg[2] = &n
7884 //
7885 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7886 // sizeof(kmp_task_ompbuilder_t),
7887 // sizeof(structArg),
7888 // @.omp_target_task_proxy_func,
7889 // ...)
7890 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7891 // sizeof(structArg))
7892 // memcpy(target_task_with_privates->privates->baseptrs,
7893 // offload_baseptrs, sizeof(offload_baseptrs)
7894 // memcpy(target_task_with_privates->privates->ptrs,
7895 // offload_ptrs, sizeof(offload_ptrs)
7896 // memcpy(target_task_with_privates->privates->sizes,
7897 // offload_sizes, sizeof(offload_sizes)
7898 // dependencies_array = ...
7899 // ;; if nowait not present
7900 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7901 // call @__kmpc_omp_task_begin_if0(...)
7902 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7903 // %target_task_with_privates)
7904 // call @__kmpc_omp_task_complete_if0(...)
7905 // }
7906 //
7907 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7908 // ptr %task) {
7909 // %structArg = alloca {ptr, ptr, ptr}
7910 // %task_ptr = getelementptr(%task, 0, 0)
7911 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7912 // mempcy(%structArg, %shared_data, sizeof(%structArg))
7913 //
7914 // %offloading_arrays = getelementptr(%task, 0, 1)
7915 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7916 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7917 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7918 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7919 // %offload_sizes, %structArg)
7920 // }
7921 //
7922 // We need the proxy function because the signature of the task entry point
7923 // expected by kmpc_omp_task is always the same and will be different from
7924 // that of the kernel_launch function.
7925 //
7926 // kernel_launch_function is generated by emitKernelLaunch and has the
7927 // always_inline attribute. For this example, it'll look like so:
7928 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7929 // %offload_sizes, %structArg) alwaysinline {
7930 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7931 // ; load aggregated data from %structArg
7932 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7933 // ; offload_sizes
7934 // call i32 @__tgt_target_kernel(...,
7935 // outlined_device_function,
7936 // ptr %kernel_args)
7937 // }
7938 // void outlined_device_function(ptr a, ptr b, ptr n) {
7939 // n = *n_ptr;
7940 // do i = 1, 10
7941 // a(i) = b(i) + n
7942 // }
7943 //
7944 BasicBlock *TargetTaskBodyBB =
7945 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7946 BasicBlock *TargetTaskAllocaBB =
7947 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7948
7949 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7950 TargetTaskAllocaBB->begin());
7951 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7952
7953 OutlineInfo OI;
7954 OI.EntryBB = TargetTaskAllocaBB;
7955 OI.OuterAllocaBB = AllocaIP.getBlock();
7956
7957 // Add the thread ID argument.
7958 SmallVector<Instruction *, 4> ToBeDeleted;
7959 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7960 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7961
7962 // Generate the task body which will subsequently be outlined.
7963 Builder.restoreIP(TargetTaskBodyIP);
7964 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7965 return Err;
7966
7967 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7968 // it is given. These blocks are enumerated by
7969 // OpenMPIRBuilder::OutlineInfo::collectBlocks, which expects OI.ExitBlock
7970 // to be outside the region. In other words, OI.ExitBlock is expected to be
7971 // the start of the region after the outlining. We used to set OI.ExitBlock
7972 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7973 // except when the task body is a single basic block. In that case,
7974 // OI.ExitBlock is set to the single task body block and will get left out of
7975 // the outlining process. So, simply create a new empty block to which we
7976 // unconditionally branch from where TaskBodyCB left off.
7977 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7978 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7979 /*IsFinished=*/true);
7980
7981 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7982 bool NeedsTargetTask = HasNoWait && DeviceID;
7983 if (NeedsTargetTask) {
7984 for (auto *V :
7985 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7986 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7987 RTArgs.SizesArray}) {
7988 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
7989 OffloadingArraysToPrivatize.push_back(V);
7990 OI.ExcludeArgsFromAggregate.push_back(V);
7991 }
7992 }
7993 }
7994 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7995 DeviceID, OffloadingArraysToPrivatize](
7996 Function &OutlinedFn) mutable {
7997 assert(OutlinedFn.hasOneUse() &&
7998 "there must be a single user for the outlined function");
7999
8000 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8001
8002 // The first argument of StaleCI is always the thread id.
8003 // The next few arguments are the pointers to offloading arrays
8004 // if any. (see OffloadingArraysToPrivatize)
8005 // Finally, all other local values that are live-in into the outlined region
8006 // end up in a structure whose pointer is passed as the last argument. This
8007 // piece of data is passed in the "shared" field of the task structure. So,
8008 // we know we have to pass shareds to the task if the number of arguments is
8009 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8010 // thread id. Further, for safety, we assert that the number of arguments of
8011 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
8012 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8013 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8014 assert((!HasShareds ||
8015 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8016 "Wrong number of arguments for StaleCI when shareds are present");
8017 int SharedArgOperandNo =
8018 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8019
8020 StructType *TaskWithPrivatesTy =
8021 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8022 StructType *PrivatesTy = nullptr;
8023
8024 if (!OffloadingArraysToPrivatize.empty())
8025 PrivatesTy =
8026 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8027
8028 Function *ProxyFn = emitTargetTaskProxyFunction(
8029 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8030 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8031
8032 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8033 << "\n");
8034
8035 Builder.SetInsertPoint(StaleCI);
8036
8037 // Gather the arguments for emitting the runtime call.
8038 uint32_t SrcLocStrSize;
8039 Constant *SrcLocStr =
8040 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8041 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8042
8043 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8044 //
8045 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8046 // the DeviceID to the deferred task, and also because
8047 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8048 Function *TaskAllocFn =
8049 !NeedsTargetTask
8050 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8051 : getOrCreateRuntimeFunctionPtr(
8052 OMPRTL___kmpc_omp_target_task_alloc);
8053
8054 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
8055 // call.
8056 Value *ThreadID = getOrCreateThreadID(Ident);
8057
8058 // Argument - `sizeof_kmp_task_t` (TaskSize)
8059 // TaskSize refers to the size in bytes of the kmp_task_t data structure
8060 // plus any other data to be passed to the target task, if any, which
8061 // is packed into a struct. kmp_task_t and the struct so created are
8062 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8063 Value *TaskSize = Builder.getInt64(
8064 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8065
8066 // Argument - `sizeof_shareds` (SharedsSize)
8067 // SharedsSize refers to the shareds array size in the kmp_task_t data
8068 // structure.
8069 Value *SharedsSize = Builder.getInt64(0);
8070 if (HasShareds) {
8071 auto *ArgStructAlloca =
8072 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8073 assert(ArgStructAlloca &&
8074 "Unable to find the alloca instruction corresponding to arguments "
8075 "for extracted function");
8076 auto *ArgStructType =
8077 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8078 assert(ArgStructType && "Unable to find struct type corresponding to "
8079 "arguments for extracted function");
8080 SharedsSize =
8081 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8082 }
8083
8084 // Argument - `flags`
8085 // Task is tied iff (Flags & 1) == 1.
8086 // Task is untied iff (Flags & 1) == 0.
8087 // Task is final iff (Flags & 2) == 2.
8088 // Task is not final iff (Flags & 2) == 0.
8089 // A target task is not final and is untied.
8090 Value *Flags = Builder.getInt32(0);
8091
8092 // Emit the @__kmpc_omp_task_alloc runtime call
8093 // The runtime call returns a pointer to an area where the task captured
8094 // variables must be copied before the task is run (TaskData)
8095 CallInst *TaskData = nullptr;
8096
8097 SmallVector<llvm::Value *> TaskAllocArgs = {
8098 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8099 /*flags=*/Flags,
8100 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8101 /*task_func=*/ProxyFn};
8102
8103 if (NeedsTargetTask) {
8104 assert(DeviceID && "Expected non-empty device ID.");
8105 TaskAllocArgs.push_back(DeviceID);
8106 }
8107
8108 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8109
8110 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8111 if (HasShareds) {
8112 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8113 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8114 *this, Builder, TaskData, TaskWithPrivatesTy);
8115 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8116 SharedsSize);
8117 }
8118 if (!OffloadingArraysToPrivatize.empty()) {
8119 Value *Privates =
8120 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8121 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8122 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8123 [[maybe_unused]] Type *ArrayType =
8124 getOffloadingArrayType(PtrToPrivatize);
8125 assert(ArrayType && "ArrayType cannot be nullptr");
8126
8127 Type *ElementType = PrivatesTy->getElementType(i);
8128 assert(ElementType == ArrayType &&
8129 "ElementType should match ArrayType");
8130 (void)ArrayType;
8131
8132 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8133 Builder.CreateMemCpy(
8134 Dst, Alignment, PtrToPrivatize, Alignment,
8135 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8136 }
8137 }
8138
8139 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8140
8141 // ---------------------------------------------------------------
8142 // V5.2 13.8 target construct
8143 // If the nowait clause is present, execution of the target task
8144 // may be deferred. If the nowait clause is not present, the target task is
8145 // an included task.
8146 // ---------------------------------------------------------------
8147 // The above means that the lack of a nowait on the target construct
8148 // translates to '#pragma omp task if(0)'
8149 if (!NeedsTargetTask) {
8150 if (DepArray) {
8151 Function *TaskWaitFn =
8152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8153 Builder.CreateCall(
8154 TaskWaitFn,
8155 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8156 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8157 /*dep_list=*/DepArray,
8158 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8159 /*noalias_dep_list=*/
8160 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8161 }
8162 // Included task.
8163 Function *TaskBeginFn =
8164 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8165 Function *TaskCompleteFn =
8166 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8167 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8168 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8169 CI->setDebugLoc(StaleCI->getDebugLoc());
8170 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8171 } else if (DepArray) {
8172 // HasNoWait - meaning the task may be deferred. Call
8173 // __kmpc_omp_task_with_deps if there are dependencies,
8174 // else call __kmpc_omp_task
8175 Function *TaskFn =
8176 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8177 Builder.CreateCall(
8178 TaskFn,
8179 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8180 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8181 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8182 } else {
8183 // Emit the @__kmpc_omp_task runtime call to spawn the task
8184 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8185 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8186 }
8187
8188 StaleCI->eraseFromParent();
8189 for (Instruction *I : llvm::reverse(ToBeDeleted))
8190 I->eraseFromParent();
8191 };
8192 addOutlineInfo(std::move(OI));
8193
8194 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8195 << *(Builder.GetInsertBlock()) << "\n");
8196 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8197 << *(Builder.GetInsertBlock()->getParent()->getParent())
8198 << "\n");
8199 return Builder.saveIP();
8200}
8201
8202Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8203 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8204 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8205 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8206 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8207 if (Error Err =
8208 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8209 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8210 return Err;
8211 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8212 return Error::success();
8213}
8214
8215static void emitTargetCall(
8216 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8217 OpenMPIRBuilder::InsertPointTy AllocaIP,
8218 OpenMPIRBuilder::TargetDataInfo &Info,
8219 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8220 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8221 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8222 SmallVectorImpl<Value *> &Args,
8223 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8224 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8225 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8226 bool HasNoWait) {
8227 // Generate a function call to the host fallback implementation of the target
8228 // region. This is called by the host when no offload entry was generated for
8229 // the target region and when the offloading call fails at runtime.
8230 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8231 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8232 Builder.restoreIP(IP);
8233 Builder.CreateCall(OutlinedFn, Args);
8234 return Builder.saveIP();
8235 };
8236
8237 bool HasDependencies = Dependencies.size() > 0;
8238 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8239
8240 OpenMPIRBuilder::TargetKernelArgs KArgs;
8241
8242 auto TaskBodyCB =
8243 [&](Value *DeviceID, Value *RTLoc,
8244 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8245 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8246 // produce any.
8247 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8248 // emitKernelLaunch makes the necessary runtime call to offload the
8249 // kernel. We then outline all that code into a separate function
8250 // ('kernel_launch_function' in the pseudo code above). This function is
8251 // then called by the target task proxy function (see
8252 // '@.omp_target_task_proxy_func' in the pseudo code above)
8253 // "@.omp_target_task_proxy_func' is generated by
8254 // emitTargetTaskProxyFunction.
8255 if (OutlinedFnID && DeviceID)
8256 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8257 EmitTargetCallFallbackCB, KArgs,
8258 DeviceID, RTLoc, TargetTaskAllocaIP);
8259
8260 // We only need to do the outlining if `DeviceID` is set to avoid calling
8261 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8262 // generating the `else` branch of an `if` clause.
8263 //
8264 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8265 // In this case, we execute the host implementation directly.
8266 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8267 }());
8268
8269 OMPBuilder.Builder.restoreIP(AfterIP);
8270 return Error::success();
8271 };
8272
8273 auto &&EmitTargetCallElse =
8274 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8275 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8276 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8277 // produce any.
8278 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8279 if (RequiresOuterTargetTask) {
8280 // Arguments that are intended to be directly forwarded to an
8281 // emitKernelLaunch call are passed as nullptr, since
8282 // OutlinedFnID=nullptr results in that call not being done.
8283 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8284 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8285 /*RTLoc=*/nullptr, AllocaIP,
8286 Dependencies, EmptyRTArgs, HasNoWait);
8287 }
8288 return EmitTargetCallFallbackCB(Builder.saveIP());
8289 }());
8290
8291 Builder.restoreIP(AfterIP);
8292 return Error::success();
8293 };
8294
8295 auto &&EmitTargetCallThen =
8296 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8297 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8298 Info.HasNoWait = HasNoWait;
8299 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8300 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8301 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8302 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8303 /*IsNonContiguous=*/true,
8304 /*ForEndCall=*/false))
8305 return Err;
8306
8307 SmallVector<Value *, 3> NumTeamsC;
8308 for (auto [DefaultVal, RuntimeVal] :
8309 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8310 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8311 : Builder.getInt32(DefaultVal));
8312
8313 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8314 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8315 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8316 if (Clause)
8317 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8318 /*isSigned=*/false);
8319 return Clause;
8320 };
8321 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8322 if (Clause)
8323 Result =
8324 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8325 Result, Clause)
8326 : Clause;
8327 };
8328
8329 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8330 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8331 SmallVector<Value *, 3> NumThreadsC;
8332 Value *MaxThreadsClause =
8333 RuntimeAttrs.TeamsThreadLimit.size() == 1
8334 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8335 : nullptr;
8336
8337 for (auto [TeamsVal, TargetVal] : zip_equal(
8338 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8339 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8340 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8341
8342 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8343 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8344
8345 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8346 }
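// Worked example of the clause combination above: `thread_limit(16)` on
// teams and `num_threads(8)` on the target region both reach here as i32
// values; CombineMaxThreadsClauses keeps the unsigned minimum, so the kernel
// argument becomes 8. With no clauses at all, the entry stays 0, which tells
// the runtime to use its default.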
8347
8348 unsigned NumTargetItems = Info.NumberOfPtrs;
8349 // TODO: Use correct device ID
8350 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8351 uint32_t SrcLocStrSize;
8352 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8353 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8354 llvm::omp::IdentFlag(0), 0);
8355
8356 Value *TripCount = RuntimeAttrs.LoopTripCount
8357 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8358 Builder.getInt64Ty(),
8359 /*isSigned=*/false)
8360 : Builder.getInt64(0);
8361
8362 // TODO: Use correct DynCGGroupMem
8363 Value *DynCGGroupMem = Builder.getInt32(0);
8364
8365 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8366 NumTeamsC, NumThreadsC,
8367 DynCGGroupMem, HasNoWait);
8368
8369 // Assume no error was returned because TaskBodyCB and
8370 // EmitTargetCallFallbackCB don't produce any.
8371 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8372 // The presence of certain clauses on the target directive require the
8373 // explicit generation of the target task.
8374 if (RequiresOuterTargetTask)
8375 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8376 Dependencies, KArgs.RTArgs,
8377 Info.HasNoWait);
8378
8379 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8380 EmitTargetCallFallbackCB, KArgs,
8381 DeviceID, RTLoc, AllocaIP);
8382 }());
8383
8384 Builder.restoreIP(AfterIP);
8385 return Error::success();
8386 };
8387
8388 // If we don't have an ID for the target region, it means an offload entry
8389 // wasn't created. In this case we just run the host fallback directly and
8390 // ignore any potential 'if' clauses.
8391 if (!OutlinedFnID) {
8392 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8393 return;
8394 }
8395
8396 // If there's no 'if' clause, only generate the kernel launch code path.
8397 if (!IfCond) {
8398 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8399 return;
8400 }
8401
8402 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8403 EmitTargetCallElse, AllocaIP));
8404}
8405
8406OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8407 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8408 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8409 TargetRegionEntryInfo &EntryInfo,
8410 const TargetKernelDefaultAttrs &DefaultAttrs,
8411 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8412 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8413 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8414 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8415 CustomMapperCallbackTy CustomMapperCB,
8416 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8417
8418 if (!updateToLocation(Loc))
8419 return InsertPointTy();
8420
8421 Builder.restoreIP(CodeGenIP);
8422
8423 Function *OutlinedFn;
8424 Constant *OutlinedFnID = nullptr;
8425 // The target region is outlined into its own function. The LLVM IR for
8426 // the target region itself is generated using the callbacks CBFunc
8427 // and ArgAccessorFuncCB.
8428 if (Error Err = emitTargetOutlinedFunction(
8429 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8430 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8431 return Err;
8432
8433 // If we are not on the target device, then we need to generate code
8434 // to make a remote call (offload) to the previously outlined function
8435 // that represents the target region. Do that now.
8436 if (!Config.isTargetDevice())
8437 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8438 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8439 CustomMapperCB, Dependencies, HasNowait);
8440 return Builder.saveIP();
8441}
8442
8443std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8444 StringRef FirstSeparator,
8445 StringRef Separator) {
8446 SmallString<128> Buffer;
8447 llvm::raw_svector_ostream OS(Buffer);
8448 StringRef Sep = FirstSeparator;
8449 for (StringRef Part : Parts) {
8450 OS << Sep << Part;
8451 Sep = Separator;
8452 }
8453 return OS.str().str();
8454}
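// Example: getNameWithSeparators({"p1", "p2"}, "$", ".") yields "$p1.p2";
// the first separator is prepended before the first part and the second
// joins the rest.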
8455
8456std::string
8457OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8458 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8459 Config.separator());
8460}
8461
8462GlobalVariable *
8463OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
8464 unsigned AddressSpace) {
8465 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8466 if (Elem.second) {
8467 assert(Elem.second->getValueType() == Ty &&
8468 "OMP internal variable has different type than requested");
8469 } else {
8470 // TODO: investigate the appropriate linkage type used for the global
8471 // variable for possibly changing that to internal or private, or maybe
8472 // create different versions of the function for different OMP internal
8473 // variables.
8474 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8475 ? GlobalValue::InternalLinkage
8476 : GlobalValue::CommonLinkage;
8477 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8478 Constant::getNullValue(Ty), Elem.first(),
8479 /*InsertBefore=*/nullptr,
8480 GlobalValue::NotThreadLocal, AddressSpace);
8481 const DataLayout &DL = M.getDataLayout();
8482 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8483 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
8484 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8485 Elem.second = GV;
8486 }
8487
8488 return Elem.second;
8489}
8490
8491Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8492 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8493 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8494 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8495}
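// Illustrative note (not in the original source): for a critical region named
// "foo" this builds the internal variable ".gomp_critical_user_foo.var", so
// every critical construct sharing that name locks on the same global.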
8496
8497Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8498 LLVMContext &Ctx = Builder.getContext();
8499 Value *Null =
8500 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8501 Value *SizeGep =
8502 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8503 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8504 return SizePtrToInt;
8505}
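// Illustrative note (not in the original source): this is the classic
// sizeof-via-GEP idiom, roughly (int64_t)&((T *)nullptr)[1] in C terms; the
// address of element 1 of a null pointer equals the allocation size of one
// element in bytes.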
8506
8507GlobalVariable *
8508OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8509 std::string VarName) {
8510 llvm::Constant *MaptypesArrayInit =
8511 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8512 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8513 M, MaptypesArrayInit->getType(),
8514 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8515 VarName);
8516 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8517 return MaptypesArrayGlobal;
8518}
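// Illustrative note (not in the original source): for two tofrom maps this
// emits something like
//   @.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 3, i64 3]
// (assuming OMP_MAP_TO | OMP_MAP_FROM == 3 purely for the sake of the example).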
8519
8520void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8521 InsertPointTy AllocaIP,
8522 unsigned NumOperands,
8523 struct MapperAllocas &MapperAllocas) {
8524 if (!updateToLocation(Loc))
8525 return;
8526
8527 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8528 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8529 Builder.restoreIP(AllocaIP);
8530 AllocaInst *ArgsBase = Builder.CreateAlloca(
8531 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8532 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8533 ".offload_ptrs");
8534 AllocaInst *ArgSizes = Builder.CreateAlloca(
8535 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8536 updateToLocation(Loc);
8537 MapperAllocas.ArgsBase = ArgsBase;
8538 MapperAllocas.Args = Args;
8539 MapperAllocas.ArgSizes = ArgSizes;
8540}
8541
8542void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8543 Function *MapperFunc, Value *SrcLocInfo,
8544 Value *MaptypesArg, Value *MapnamesArg,
8545 struct MapperAllocas &MapperAllocas,
8546 int64_t DeviceID, unsigned NumOperands) {
8547 if (!updateToLocation(Loc))
8548 return;
8549
8550 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8551 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8552 Value *ArgsBaseGEP =
8553 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8554 {Builder.getInt32(0), Builder.getInt32(0)});
8555 Value *ArgsGEP =
8556 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8557 {Builder.getInt32(0), Builder.getInt32(0)});
8558 Value *ArgSizesGEP =
8559 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8560 {Builder.getInt32(0), Builder.getInt32(0)});
8561 Value *NullPtr =
8562 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8563 Builder.CreateCall(MapperFunc,
8564 {SrcLocInfo, Builder.getInt64(DeviceID),
8565 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8566 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8567}
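// Illustrative note (not in the original source): for NumOperands == 1 and a
// begin mapper, the emitted call is shaped like
//   call void @__tgt_target_data_begin_mapper(ptr @loc, i64 -1, i32 1,
//       ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr @maptypes, ptr @mapnames,
//       ptr null)
// where the trailing null stands in for the mappers array.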
8568
8569void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8570 TargetDataRTArgs &RTArgs,
8571 TargetDataInfo &Info,
8572 bool ForEndCall) {
8573 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8574 "expected region end call to runtime only when end call is separate");
8575 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8576 auto VoidPtrTy = UnqualPtrTy;
8577 auto VoidPtrPtrTy = UnqualPtrTy;
8578 auto Int64Ty = Type::getInt64Ty(M.getContext());
8579 auto Int64PtrTy = UnqualPtrTy;
8580
8581 if (!Info.NumberOfPtrs) {
8582 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8583 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8584 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8585 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8586 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8587 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8588 return;
8589 }
8590
8591 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8592 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8593 Info.RTArgs.BasePointersArray,
8594 /*Idx0=*/0, /*Idx1=*/0);
8595 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8596 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8597 /*Idx0=*/0,
8598 /*Idx1=*/0);
8599 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8600 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8601 /*Idx0=*/0, /*Idx1=*/0);
8602 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8603 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8604 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8605 : Info.RTArgs.MapTypesArray,
8606 /*Idx0=*/0,
8607 /*Idx1=*/0);
8608
8609 // Only emit the mapper information arrays if debug information is
8610 // requested.
8611 if (!Info.EmitDebug)
8612 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8613 else
8614 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8615 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8616 /*Idx0=*/0,
8617 /*Idx1=*/0);
8618 // If there is no user-defined mapper, set the mapper array to nullptr to
8619 // avoid an unnecessary data privatization
8620 if (!Info.HasMapper)
8621 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8622 else
8623 RTArgs.MappersArray =
8624 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8625}
8626
8627void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8628 InsertPointTy CodeGenIP,
8629 MapInfosTy &CombinedInfo,
8630 TargetDataInfo &Info) {
8631 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8632 CombinedInfo.NonContigInfo;
8633
8634 // Build an array of struct descriptor_dim and then assign it to
8635 // offload_args.
8636 //
8637 // struct descriptor_dim {
8638 // uint64_t offset;
8639 // uint64_t count;
8640 // uint64_t stride
8641 // };
8642 Type *Int64Ty = Builder.getInt64Ty();
8643 StructType *DimTy = StructType::create(
8644 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8645 "struct.descriptor_dim");
8646
8647 enum { OffsetFD = 0, CountFD, StrideFD };
8648 // We need two index variables here since the size of "Dims" is the same as
8649 // the size of Components; however, the sizes of offset, count, and stride are
8650 // equal to the number of non-contiguous base declarations.
8651 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8652 // Skip emitting IR if the dimension size is 1, since it cannot be
8653 // non-contiguous.
8654 if (NonContigInfo.Dims[I] == 1)
8655 continue;
8656 Builder.restoreIP(AllocaIP);
8657 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8658 AllocaInst *DimsAddr =
8659 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8660 Builder.restoreIP(CodeGenIP);
8661 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8662 unsigned RevIdx = EE - II - 1;
8663 Value *DimsLVal = Builder.CreateInBoundsGEP(
8664 DimsAddr->getAllocatedType(), DimsAddr,
8665 {Builder.getInt64(0), Builder.getInt64(II)});
8666 // Offset
8667 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8668 Builder.CreateAlignedStore(
8669 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8670 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8671 // Count
8672 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8673 Builder.CreateAlignedStore(
8674 NonContigInfo.Counts[L][RevIdx], CountLVal,
8675 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8676 // Stride
8677 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8678 Builder.CreateAlignedStore(
8679 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8680 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8681 }
8682 // args[I] = &dims
8683 Builder.restoreIP(CodeGenIP);
8684 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8685 DimsAddr, Builder.getPtrTy());
8686 Value *P = Builder.CreateConstInBoundsGEP2_32(
8687 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8688 Info.RTArgs.PointersArray, 0, I);
8689 Builder.CreateAlignedStore(
8690 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8691 ++L;
8692 }
8693}
8694
8695void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8696 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8697 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8698 BasicBlock *ExitBB, bool IsInit) {
8699 StringRef Prefix = IsInit ? ".init" : ".del";
8700
8701 // Evaluate if this is an array section.
8702 BasicBlock *BodyBB = BasicBlock::Create(
8703 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8704 Value *IsArray =
8705 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8706 Value *DeleteBit = Builder.CreateAnd(
8707 MapType,
8708 Builder.getInt64(
8709 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8710 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8711 Value *DeleteCond;
8712 Value *Cond;
8713 if (IsInit) {
8714 // base != begin?
8715 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8716 // IsPtrAndObj?
8717 Value *PtrAndObjBit = Builder.CreateAnd(
8718 MapType,
8719 Builder.getInt64(
8720 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8721 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8722 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8723 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8724 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8725 DeleteCond = Builder.CreateIsNull(
8726 DeleteBit,
8727 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8728 } else {
8729 Cond = IsArray;
8730 DeleteCond = Builder.CreateIsNotNull(
8731 DeleteBit,
8732 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8733 }
8734 Cond = Builder.CreateAnd(Cond, DeleteCond);
8735 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8736
8737 emitBlock(BodyBB, MapperFn);
8738 // Get the array size by multiplying element size and element number (i.e., \p
8739 // Size).
8740 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8741 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8742 // memory allocation/deletion purpose only.
8743 Value *MapTypeArg = Builder.CreateAnd(
8744 MapType,
8745 Builder.getInt64(
8746 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8747 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8748 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8749 MapTypeArg = Builder.CreateOr(
8750 MapTypeArg,
8751 Builder.getInt64(
8752 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8753 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8754
8755 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8756 // data structure.
8757 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8758 ArraySize, MapTypeArg, MapName};
8759 Builder.CreateCall(
8760 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8761 OffloadingArgs);
8762}
8763
8764Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8765 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8766 llvm::Value *BeginArg)>
8767 GenMapInfoCB,
8768 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8769 SmallVector<Type *> Params;
8770 Params.emplace_back(Builder.getPtrTy());
8771 Params.emplace_back(Builder.getPtrTy());
8772 Params.emplace_back(Builder.getPtrTy());
8773 Params.emplace_back(Builder.getInt64Ty());
8774 Params.emplace_back(Builder.getInt64Ty());
8775 Params.emplace_back(Builder.getPtrTy());
8776
8777 auto *FnTy =
8778 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8779
8780 SmallString<64> TyStr;
8781 raw_svector_ostream Out(TyStr);
8782 Function *MapperFn =
8783 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
8784 MapperFn->addFnAttr(Attribute::NoInline);
8785 MapperFn->addFnAttr(Attribute::NoUnwind);
8786 MapperFn->addParamAttr(0, Attribute::NoUndef);
8787 MapperFn->addParamAttr(1, Attribute::NoUndef);
8788 MapperFn->addParamAttr(2, Attribute::NoUndef);
8789 MapperFn->addParamAttr(3, Attribute::NoUndef);
8790 MapperFn->addParamAttr(4, Attribute::NoUndef);
8791 MapperFn->addParamAttr(5, Attribute::NoUndef);
8792
8793 // Start the mapper function code generation.
8794 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8795 auto SavedIP = Builder.saveIP();
8796 Builder.SetInsertPoint(EntryBB);
8797
8798 Value *MapperHandle = MapperFn->getArg(0);
8799 Value *BaseIn = MapperFn->getArg(1);
8800 Value *BeginIn = MapperFn->getArg(2);
8801 Value *Size = MapperFn->getArg(3);
8802 Value *MapType = MapperFn->getArg(4);
8803 Value *MapName = MapperFn->getArg(5);
8804
8805 // Compute the starting and end addresses of array elements.
8806 // Prepare common arguments for array initialization and deletion.
8807 // Convert the size in bytes into the number of array elements.
8808 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8809 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8810 Value *PtrBegin = BeginIn;
8811 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8812
8813 // Emit array initialization if this is an array section and \p MapType
8814 // indicates that memory allocation is required.
8815 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8816 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8817 MapType, MapName, ElementSize, HeadBB,
8818 /*IsInit=*/true);
8819
8820 // Emit a for loop to iterate through Size elements and map all of them.
8821
8822 // Emit the loop header block.
8823 emitBlock(HeadBB, MapperFn);
8824 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8825 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8826 // Evaluate whether the initial condition is satisfied.
8827 Value *IsEmpty =
8828 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8829 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8830
8831 // Emit the loop body block.
8832 emitBlock(BodyBB, MapperFn);
8833 BasicBlock *LastBB = BodyBB;
8834 PHINode *PtrPHI =
8835 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8836 PtrPHI->addIncoming(PtrBegin, HeadBB);
8837
8838 // Get map clause information. Fill up the arrays with all mapped variables.
8839 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8840 if (!Info)
8841 return Info.takeError();
8842
8843 // Call the runtime API __tgt_mapper_num_components to get the number of
8844 // pre-existing components.
8845 Value *OffloadingArgs[] = {MapperHandle};
8846 Value *PreviousSize = Builder.CreateCall(
8847 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8848 OffloadingArgs);
8849 Value *ShiftedPreviousSize =
8850 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8851
8852 // Fill up the runtime mapper handle for all components.
8853 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8854 Value *CurBaseArg = Info->BasePointers[I];
8855 Value *CurBeginArg = Info->Pointers[I];
8856 Value *CurSizeArg = Info->Sizes[I];
8857 Value *CurNameArg = Info->Names.size()
8858 ? Info->Names[I]
8859 : Constant::getNullValue(Builder.getPtrTy());
8860
8861 // Extract the MEMBER_OF field from the map type.
8862 Value *OriMapType = Builder.getInt64(
8863 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8864 Info->Types[I]));
8865 Value *MemberMapType =
8866 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8867
8868 // Combine the map type inherited from user-defined mapper with that
8869 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8870 // bits of the \a MapType, which is the input argument of the mapper
8871 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8872 // bits of MemberMapType.
8873 // [OpenMP 5.0], 1.2.6. map-type decay.
8874 // | alloc | to | from | tofrom | release | delete
8875 // ----------------------------------------------------------
8876 // alloc | alloc | alloc | alloc | alloc | release | delete
8877 // to | alloc | to | alloc | to | release | delete
8878 // from | alloc | alloc | from | from | release | delete
8879 // tofrom | alloc | to | from | tofrom | release | delete
8880 Value *LeftToFrom = Builder.CreateAnd(
8881 MapType,
8882 Builder.getInt64(
8883 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8884 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8885 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8886 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8887 BasicBlock *AllocElseBB =
8888 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8889 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8890 BasicBlock *ToElseBB =
8891 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8892 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8893 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8894 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8895 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8896 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8897 emitBlock(AllocBB, MapperFn);
8898 Value *AllocMapType = Builder.CreateAnd(
8899 MemberMapType,
8900 Builder.getInt64(
8901 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8902 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8903 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8904 Builder.CreateBr(EndBB);
8905 emitBlock(AllocElseBB, MapperFn);
8906 Value *IsTo = Builder.CreateICmpEQ(
8907 LeftToFrom,
8908 Builder.getInt64(
8909 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8910 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8911 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8912 // In case of to, clear OMP_MAP_FROM.
8913 emitBlock(ToBB, MapperFn);
8914 Value *ToMapType = Builder.CreateAnd(
8915 MemberMapType,
8916 Builder.getInt64(
8917 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8918 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8919 Builder.CreateBr(EndBB);
8920 emitBlock(ToElseBB, MapperFn);
8921 Value *IsFrom = Builder.CreateICmpEQ(
8922 LeftToFrom,
8923 Builder.getInt64(
8924 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8925 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8926 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8927 // In case of from, clear OMP_MAP_TO.
8928 emitBlock(FromBB, MapperFn);
8929 Value *FromMapType = Builder.CreateAnd(
8930 MemberMapType,
8931 Builder.getInt64(
8932 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8933 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8934 // In case of tofrom, do nothing.
8935 emitBlock(EndBB, MapperFn);
8936 LastBB = EndBB;
8937 PHINode *CurMapType =
8938 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8939 CurMapType->addIncoming(AllocMapType, AllocBB);
8940 CurMapType->addIncoming(ToMapType, ToBB);
8941 CurMapType->addIncoming(FromMapType, FromBB);
8942 CurMapType->addIncoming(MemberMapType, ToElseBB);
8943
8944 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8945 CurSizeArg, CurMapType, CurNameArg};
8946
8947 auto ChildMapperFn = CustomMapperCB(I);
8948 if (!ChildMapperFn)
8949 return ChildMapperFn.takeError();
8950 if (*ChildMapperFn) {
8951 // Call the corresponding mapper function.
8952 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8953 } else {
8954 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8955 // data structure.
8956 Builder.CreateCall(
8957 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8958 OffloadingArgs);
8959 }
8960 }
8961
8962 // Update the pointer to point to the next element that needs to be mapped,
8963 // and check whether we have mapped all elements.
8964 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8965 "omp.arraymap.next");
8966 PtrPHI->addIncoming(PtrNext, LastBB);
8967 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8968 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8969 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8970
8971 emitBlock(ExitBB, MapperFn);
8972 // Emit array deletion if this is an array section and \p MapType indicates
8973 // that deletion is required.
8974 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8975 MapType, MapName, ElementSize, DoneBB,
8976 /*IsInit=*/false);
8977
8978 // Emit the function exit block.
8979 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8980
8981 Builder.CreateRetVoid();
8982 Builder.restoreIP(SavedIP);
8983 return MapperFn;
8984}
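// Illustrative note (not in the original source): the generated mapper has the
// fixed signature
//   void <FuncName>(ptr %handle, ptr %base, ptr %begin, i64 %size,
//                   i64 %type, ptr %name)
// and loops over %size elements, pushing one runtime component per element via
// __tgt_push_mapper_component (or a nested child mapper when one is provided).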
8985
8986Error OpenMPIRBuilder::emitOffloadingArrays(
8987 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8988 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8989 bool IsNonContiguous,
8990 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8991
8992 // Reset the array information.
8993 Info.clearArrayInfo();
8994 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8995
8996 if (Info.NumberOfPtrs == 0)
8997 return Error::success();
8998
8999 Builder.restoreIP(AllocaIP);
9000 // Detect whether any capture size requires runtime evaluation; if none
9001 // does, a constant array can eventually be used for the map sizes.
9002 ArrayType *PointerArrayType =
9003 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9004
9005 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9006 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9007
9008 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9009 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9010 AllocaInst *MappersArray = Builder.CreateAlloca(
9011 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9012 Info.RTArgs.MappersArray = MappersArray;
9013
9014 // If we don't have any VLA types or other types that require runtime
9015 // evaluation, we can use a constant array for the map sizes, otherwise we
9016 // need to fill up the arrays as we do for the pointers.
9017 Type *Int64Ty = Builder.getInt64Ty();
9018 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9019 ConstantInt::get(Int64Ty, 0));
9020 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9021 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9022 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9023 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9024 if (IsNonContiguous &&
9025 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9026 CombinedInfo.Types[I] &
9027 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9028 ConstSizes[I] =
9029 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9030 else
9031 ConstSizes[I] = CI;
9032 continue;
9033 }
9034 }
9035 RuntimeSizes.set(I);
9036 }
9037
9038 if (RuntimeSizes.all()) {
9039 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9040 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9041 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9042 restoreIPandDebugLoc(Builder, CodeGenIP);
9043 } else {
9044 auto *SizesArrayInit = ConstantArray::get(
9045 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9046 std::string Name = createPlatformSpecificName({"offload_sizes"});
9047 auto *SizesArrayGbl =
9048 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9049 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9050 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9051
9052 if (!RuntimeSizes.any()) {
9053 Info.RTArgs.SizesArray = SizesArrayGbl;
9054 } else {
9055 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9056 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9057 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9058 AllocaInst *Buffer = Builder.CreateAlloca(
9059 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9060 Buffer->setAlignment(OffloadSizeAlign);
9061 restoreIPandDebugLoc(Builder, CodeGenIP);
9062 Builder.CreateMemCpy(
9063 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9064 SizesArrayGbl, OffloadSizeAlign,
9065 Builder.getIntN(
9066 IndexSize,
9067 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9068
9069 Info.RTArgs.SizesArray = Buffer;
9070 }
9071 restoreIPandDebugLoc(Builder, CodeGenIP);
9072 }
9073
9074 // The map types are always constant so we don't need to generate code to
9075 // fill arrays. Instead, we create an array constant.
9076 SmallVector<uint64_t, 4> Mapping;
9077 for (auto mapFlag : CombinedInfo.Types)
9078 Mapping.push_back(
9079 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9080 mapFlag));
9081 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9082 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9083 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9084
9085 // The information types are only built if provided.
9086 if (!CombinedInfo.Names.empty()) {
9087 auto *MapNamesArrayGbl = createOffloadMapnames(
9088 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9089 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9090 Info.EmitDebug = true;
9091 } else {
9092 Info.RTArgs.MapNamesArray =
9093 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9094 Info.EmitDebug = false;
9095 }
9096
9097 // If there's a present map type modifier, it must not be applied to the end
9098 // of a region, so generate a separate map type array in that case.
9099 if (Info.separateBeginEndCalls()) {
9100 bool EndMapTypesDiffer = false;
9101 for (uint64_t &Type : Mapping) {
9102 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9103 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9104 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9105 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9106 EndMapTypesDiffer = true;
9107 }
9108 }
9109 if (EndMapTypesDiffer) {
9110 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9111 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9112 }
9113 }
9114
9115 PointerType *PtrTy = Builder.getPtrTy();
9116 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9117 Value *BPVal = CombinedInfo.BasePointers[I];
9118 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9119 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9120 0, I);
9121 Builder.CreateAlignedStore(BPVal, BP,
9122 M.getDataLayout().getPrefTypeAlign(PtrTy));
9123
9124 if (Info.requiresDevicePointerInfo()) {
9125 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9126 CodeGenIP = Builder.saveIP();
9127 Builder.restoreIP(AllocaIP);
9128 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9129 Builder.restoreIP(CodeGenIP);
9130 if (DeviceAddrCB)
9131 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9132 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9133 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9134 if (DeviceAddrCB)
9135 DeviceAddrCB(I, BP);
9136 }
9137 }
9138
9139 Value *PVal = CombinedInfo.Pointers[I];
9140 Value *P = Builder.CreateConstInBoundsGEP2_32(
9141 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9142 I);
9143 // TODO: Check alignment correct.
9144 Builder.CreateAlignedStore(PVal, P,
9145 M.getDataLayout().getPrefTypeAlign(PtrTy));
9146
9147 if (RuntimeSizes.test(I)) {
9148 Value *S = Builder.CreateConstInBoundsGEP2_32(
9149 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9150 /*Idx0=*/0,
9151 /*Idx1=*/I);
9152 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9153 Int64Ty,
9154 /*isSigned=*/true),
9155 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9156 }
9157 // Fill up the mapper array.
9158 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9159 Value *MFunc = ConstantPointerNull::get(PtrTy);
9160
9161 auto CustomMFunc = CustomMapperCB(I);
9162 if (!CustomMFunc)
9163 return CustomMFunc.takeError();
9164 if (*CustomMFunc)
9165 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9166
9167 Value *MAddr = Builder.CreateInBoundsGEP(
9168 MappersArray->getAllocatedType(), MappersArray,
9169 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9170 Builder.CreateAlignedStore(
9171 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9172 }
9173
9174 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9175 Info.NumberOfPtrs == 0)
9176 return Error::success();
9177 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9178 return Error::success();
9179}
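// Illustrative note (not in the original source): for two mapped pointers this
// materializes allocas like
//   %.offload_baseptrs = alloca [2 x ptr]
//   %.offload_ptrs     = alloca [2 x ptr]
//   %.offload_mappers  = alloca [2 x ptr]
// plus either a constant @.offload_sizes global or an i64 array alloca when
// any size needs runtime evaluation.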
9180
9181void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9182 BasicBlock *CurBB = Builder.GetInsertBlock();
9183
9184 if (!CurBB || CurBB->getTerminator()) {
9185 // If there is no insert point or the previous block is already
9186 // terminated, don't touch it.
9187 } else {
9188 // Otherwise, create a fall-through branch.
9189 Builder.CreateBr(Target);
9190 }
9191
9192 Builder.ClearInsertionPoint();
9193}
9194
9195void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9196 bool IsFinished) {
9197 BasicBlock *CurBB = Builder.GetInsertBlock();
9198
9199 // Fall out of the current block (if necessary).
9200 emitBranch(BB);
9201
9202 if (IsFinished && BB->use_empty()) {
9203 BB->eraseFromParent();
9204 return;
9205 }
9206
9207 // Place the block after the current block, if possible, or else at
9208 // the end of the function.
9209 if (CurBB && CurBB->getParent())
9210 CurFn->insert(std::next(CurBB->getIterator()), BB);
9211 else
9212 CurFn->insert(CurFn->end(), BB);
9213 Builder.SetInsertPoint(BB);
9214}
9215
9216Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9217 BodyGenCallbackTy ElseGen,
9218 InsertPointTy AllocaIP) {
9219 // If the condition constant folds and can be elided, try to avoid emitting
9220 // the condition and the dead arm of the if/else.
9221 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9222 auto CondConstant = CI->getSExtValue();
9223 if (CondConstant)
9224 return ThenGen(AllocaIP, Builder.saveIP());
9225
9226 return ElseGen(AllocaIP, Builder.saveIP());
9227 }
9228
9229 Function *CurFn = Builder.GetInsertBlock()->getParent();
9230
9231 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9232 // emit the conditional branch.
9233 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9234 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9235 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9236 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9237 // Emit the 'then' code.
9238 emitBlock(ThenBlock, CurFn);
9239 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9240 return Err;
9241 emitBranch(ContBlock);
9242 // Emit the 'else' code if present.
9243 // There is no need to emit line number for unconditional branch.
9244 emitBlock(ElseBlock, CurFn);
9245 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9246 return Err;
9247 // There is no need to emit line number for unconditional branch.
9248 emitBranch(ContBlock);
9249 // Emit the continuation block for code after the if.
9250 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9251 return Error::success();
9252}
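// Illustrative note (not in the original source): for a non-constant Cond the
// emitted CFG is
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// with both arms falling through to %omp_if.end; constant conditions skip the
// dead arm entirely.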
9253
9254bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9255 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9258 "Unexpected Atomic Ordering.");
9259
9260 bool Flush = false;
9261 llvm::AtomicOrdering FlushAO = llvm::AtomicOrdering::Monotonic;
9262
9263 switch (AK) {
9264 case Read:
9265 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9266 AO == AtomicOrdering::SequentiallyConsistent) {
9267 FlushAO = AtomicOrdering::Acquire;
9268 Flush = true;
9269 }
9270 break;
9271 case Write:
9272 case Compare:
9273 case Update:
9274 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9275 AO == AtomicOrdering::SequentiallyConsistent) {
9276 FlushAO = AtomicOrdering::Release;
9277 Flush = true;
9278 }
9279 break;
9280 case Capture:
9281 switch (AO) {
9282 case AtomicOrdering::Acquire:
9283 FlushAO = AtomicOrdering::Acquire;
9284 Flush = true;
9285 break;
9286 case AtomicOrdering::Release:
9287 FlushAO = AtomicOrdering::Release;
9288 Flush = true;
9289 break;
9290 case AtomicOrdering::AcquireRelease:
9291 case AtomicOrdering::SequentiallyConsistent:
9292 FlushAO = AtomicOrdering::AcquireRelease;
9293 Flush = true;
9294 break;
9295 default:
9296 // do nothing - leave silently.
9297 break;
9298 }
9299 }
9300
9301 if (Flush) {
9302 // The flush RT call currently does not take a memory ordering; until it
9303 // does, we still resolve which atomic ordering would apply, but issue the
9304 // plain flush call.
9305 // TODO: pass `FlushAO` once memory ordering support is added.
9306 (void)FlushAO;
9307 emitFlush(Loc);
9308 }
9309
9310 // For AO == AtomicOrdering::Monotonic and all other case combinations,
9311 // do nothing.
9312 return Flush;
9313}
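// Illustrative summary (not in the original source): a flush is emitted for
// read at acquire/acq_rel/seq_cst, for write/update/compare at
// release/acq_rel/seq_cst, and for capture at acquire, release, acq_rel, or
// seq_cst; monotonic ordering never flushes.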
9314
9315OpenMPIRBuilder::InsertPointTy
9316OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9317 AtomicOpValue &X, AtomicOpValue &V,
9318 AtomicOrdering AO, InsertPointTy AllocaIP) {
9319 if (!updateToLocation(Loc))
9320 return Loc.IP;
9321
9322 assert(X.Var->getType()->isPointerTy() &&
9323 "OMP Atomic expects a pointer to target memory");
9324 Type *XElemTy = X.ElemTy;
9325 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9326 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9327 "OMP atomic read expected a scalar type");
9328
9329 Value *XRead = nullptr;
9330
9331 if (XElemTy->isIntegerTy()) {
9332 LoadInst *XLD =
9333 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9334 XLD->setAtomic(AO);
9335 XRead = cast<Value>(XLD);
9336 } else if (XElemTy->isStructTy()) {
9337 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9338 // target does not support `atomicrmw` of the size of the struct
9339 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9340 OldVal->setAtomic(AO);
9341 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9342 unsigned LoadSize =
9343 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9344 OpenMPIRBuilder::AtomicInfo atomicInfo(
9345 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9346 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9347 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9348 XRead = AtomicLoadRes.first;
9349 OldVal->eraseFromParent();
9350 } else {
9351 // We need to perform atomic op as integer
9352 IntegerType *IntCastTy =
9353 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9354 LoadInst *XLoad =
9355 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9356 XLoad->setAtomic(AO);
9357 if (XElemTy->isFloatingPointTy()) {
9358 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9359 } else {
9360 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9361 }
9362 }
9363 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9364 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9365 return Builder.saveIP();
9366}
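// Illustrative note (not in the original source): for an i32 'x' read at
// acquire ordering this lowers to roughly
//   %omp.atomic.read = load atomic i32, ptr %x acquire, align 4
//   store i32 %omp.atomic.read, ptr %v
// with non-integer scalars loaded through an equally sized integer and cast
// back to their element type.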
9367
9368OpenMPIRBuilder::InsertPointTy
9369OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9370 AtomicOpValue &X, Value *Expr,
9371 AtomicOrdering AO, InsertPointTy AllocaIP) {
9372 if (!updateToLocation(Loc))
9373 return Loc.IP;
9374
9375 assert(X.Var->getType()->isPointerTy() &&
9376 "OMP Atomic expects a pointer to target memory");
9377 Type *XElemTy = X.ElemTy;
9378 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9379 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9380 "OMP atomic write expected a scalar type");
9381
9382 if (XElemTy->isIntegerTy()) {
9383 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9384 XSt->setAtomic(AO);
9385 } else if (XElemTy->isStructTy()) {
9386 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9387 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9388 unsigned LoadSize =
9389 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9390 OpenMPIRBuilder::AtomicInfo atomicInfo(
9391 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9392 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9393 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9394 OldVal->eraseFromParent();
9395 } else {
9396 // We need to bitcast and perform atomic op as integers
9397 IntegerType *IntCastTy =
9398 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9399 Value *ExprCast =
9400 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9401 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9402 XSt->setAtomic(AO);
9403 }
9404
9405 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9406 return Builder.saveIP();
9407}
9408
9409OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9410 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9411 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9412 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9413 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9414 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9415 if (!updateToLocation(Loc))
9416 return Loc.IP;
9417
9418 LLVM_DEBUG({
9419 Type *XTy = X.Var->getType();
9420 assert(XTy->isPointerTy() &&
9421 "OMP Atomic expects a pointer to target memory");
9422 Type *XElemTy = X.ElemTy;
9423 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9424 XElemTy->isPointerTy()) &&
9425 "OMP atomic update expected a scalar type");
9426 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9427 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9428 "OpenMP atomic does not support LT or GT operations");
9429 });
9430
9431 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9432 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9433 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9434 if (!AtomicResult)
9435 return AtomicResult.takeError();
9436 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9437 return Builder.saveIP();
9438}
9439
9440// FIXME: Duplicating AtomicExpand
9441Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9442 AtomicRMWInst::BinOp RMWOp) {
9443 switch (RMWOp) {
9444 case AtomicRMWInst::Add:
9445 return Builder.CreateAdd(Src1, Src2);
9446 case AtomicRMWInst::Sub:
9447 return Builder.CreateSub(Src1, Src2);
9448 case AtomicRMWInst::And:
9449 return Builder.CreateAnd(Src1, Src2);
9450 case AtomicRMWInst::Nand:
9451 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9452 case AtomicRMWInst::Or:
9453 return Builder.CreateOr(Src1, Src2);
9454 case AtomicRMWInst::Xor:
9455 return Builder.CreateXor(Src1, Src2);
9456 case AtomicRMWInst::Xchg:
9457 case AtomicRMWInst::FAdd:
9458 case AtomicRMWInst::FSub:
9459 case AtomicRMWInst::BAD_BINOP:
9460 case AtomicRMWInst::Max:
9461 case AtomicRMWInst::Min:
9462 case AtomicRMWInst::UMax:
9463 case AtomicRMWInst::UMin:
9464 case AtomicRMWInst::FMax:
9465 case AtomicRMWInst::FMin:
9466 case AtomicRMWInst::FMaximum:
9467 case AtomicRMWInst::FMinimum:
9468 case AtomicRMWInst::UIncWrap:
9469 case AtomicRMWInst::UDecWrap:
9470 case AtomicRMWInst::USubCond:
9471 case AtomicRMWInst::USubSat:
9472 llvm_unreachable("Unsupported atomic update operation");
9473 }
9474 llvm_unreachable("Unsupported atomic update operation");
9475}
9476
9477Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9478 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9479 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9480 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9481 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9482 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9483 // or a complex datatype.
9484 bool emitRMWOp = false;
9485 switch (RMWOp) {
9486 case AtomicRMWInst::Add:
9487 case AtomicRMWInst::And:
9488 case AtomicRMWInst::Nand:
9489 case AtomicRMWInst::Or:
9490 case AtomicRMWInst::Xor:
9491 case AtomicRMWInst::Xchg:
9492 emitRMWOp = XElemTy;
9493 break;
9494 case AtomicRMWInst::Sub:
9495 emitRMWOp = (IsXBinopExpr && XElemTy);
9496 break;
9497 default:
9498 emitRMWOp = false;
9499 }
9500 emitRMWOp &= XElemTy->isIntegerTy();
9501
9502 std::pair<Value *, Value *> Res;
9503 if (emitRMWOp) {
9504 AtomicRMWInst *RMWInst =
9505 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9506 if (T.isAMDGPU()) {
9507 if (IsIgnoreDenormalMode)
9508 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9509 llvm::MDNode::get(Builder.getContext(), {}));
9510 if (!IsFineGrainedMemory)
9511 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9512 llvm::MDNode::get(Builder.getContext(), {}));
9513 if (!IsRemoteMemory)
9514 RMWInst->setMetadata("amdgpu.no.remote.memory",
9515 llvm::MDNode::get(Builder.getContext(), {}));
9516 }
9517 Res.first = RMWInst;
9518 // Not needed except in case of postfix captures. Generate anyway for
9519 // consistency with the else part; any DCE pass will remove it.
9520 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9521 if (RMWOp == AtomicRMWInst::Xchg)
9522 Res.second = Res.first;
9523 else
9524 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9525 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9526 XElemTy->isStructTy()) {
9527 LoadInst *OldVal =
9528 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9529 OldVal->setAtomic(AO);
9530 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9531 unsigned LoadSize =
9532 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9533
9534 OpenMPIRBuilder::AtomicInfo atomicInfo(
9535 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9536 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9537 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9538 BasicBlock *CurBB = Builder.GetInsertBlock();
9539 Instruction *CurBBTI = CurBB->getTerminator();
9540 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9541 BasicBlock *ExitBB =
9542 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9543 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9544 X->getName() + ".atomic.cont");
9545 ContBB->getTerminator()->eraseFromParent();
9546 Builder.restoreIP(AllocaIP);
9547 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9548 NewAtomicAddr->setName(X->getName() + "x.new.val");
9549 Builder.SetInsertPoint(ContBB);
9550 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9551 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9552 Value *OldExprVal = PHI;
9553 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9554 if (!CBResult)
9555 return CBResult.takeError();
9556 Value *Upd = *CBResult;
9557 Builder.CreateStore(Upd, NewAtomicAddr);
9558 AtomicOrdering Failure =
9559 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9560 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9561 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9562 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9563 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9564 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9565 OldVal->eraseFromParent();
9566 Res.first = OldExprVal;
9567 Res.second = Upd;
9568
9569 if (UnreachableInst *ExitTI =
9570 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9571 CurBBTI->eraseFromParent();
9572 Builder.SetInsertPoint(ExitBB);
9573 } else {
9574 Builder.SetInsertPoint(ExitTI);
9575 }
9576 } else {
9577 IntegerType *IntCastTy =
9578 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9579 LoadInst *OldVal =
9580 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9581 OldVal->setAtomic(AO);
9582 // CurBB
9583 // | /---\
9584 // ContBB |
9585 // | \---/
9586 // ExitBB
9587 BasicBlock *CurBB = Builder.GetInsertBlock();
9588 Instruction *CurBBTI = CurBB->getTerminator();
9589 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9590 BasicBlock *ExitBB =
9591 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9592 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9593 X->getName() + ".atomic.cont");
9594 ContBB->getTerminator()->eraseFromParent();
9595 Builder.restoreIP(AllocaIP);
9596 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9597 NewAtomicAddr->setName(X->getName() + "x.new.val");
9598 Builder.SetInsertPoint(ContBB);
9599 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9600 PHI->addIncoming(OldVal, CurBB);
9601 bool IsIntTy = XElemTy->isIntegerTy();
9602 Value *OldExprVal = PHI;
9603 if (!IsIntTy) {
9604 if (XElemTy->isFloatingPointTy()) {
9605 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9606 X->getName() + ".atomic.fltCast");
9607 } else {
9608 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9609 X->getName() + ".atomic.ptrCast");
9610 }
9611 }
9612
9613 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9614 if (!CBResult)
9615 return CBResult.takeError();
9616 Value *Upd = *CBResult;
9617 Builder.CreateStore(Upd, NewAtomicAddr);
9618 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9619 AtomicOrdering Failure =
9620 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9621 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9622 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9623 Result->setVolatile(VolatileX);
9624 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9625 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9626 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9627 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9628
9629 Res.first = OldExprVal;
9630 Res.second = Upd;
9631
9632 // Set the insertion point in the exit block.
9633 if (UnreachableInst *ExitTI =
9634 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9635 CurBBTI->eraseFromParent();
9636 Builder.SetInsertPoint(ExitBB);
9637 } else {
9638 Builder.SetInsertPoint(ExitTI);
9639 }
9640 }
9641
9642 return Res;
9643}
9644
9645OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9646 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9647 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9648 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9649 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9650 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9651 if (!updateToLocation(Loc))
9652 return Loc.IP;
9653
9654 LLVM_DEBUG({
9655 Type *XTy = X.Var->getType();
9656 assert(XTy->isPointerTy() &&
9657 "OMP Atomic expects a pointer to target memory");
9658 Type *XElemTy = X.ElemTy;
9659 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9660 XElemTy->isPointerTy()) &&
9661 "OMP atomic capture expected a scalar type");
9662 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9663 "OpenMP atomic does not support LT or GT operations");
9664 });
9665
9666 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9667 // 'x' is simply atomically rewritten with 'expr'.
9668 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9669 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9670 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9671 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9672 if (!AtomicResult)
9673 return AtomicResult.takeError();
9674 Value *CapturedVal =
9675 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9676 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9677
9678 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9679 return Builder.saveIP();
9680}
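// Illustrative note (not in the original source): for 'v = x; x += expr'
// (postfix) the captured value is the old x (AtomicResult->first), while for
// 'x += expr; v = x' (prefix) it is the updated value (AtomicResult->second).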
9681
9682OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9683 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9684 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9685 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9686 bool IsFailOnly) {
9687
9688 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9689 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9690 IsPostfixUpdate, IsFailOnly, Failure);
9691}
9692
9693OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9694 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9695 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9696 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9697 bool IsFailOnly, AtomicOrdering Failure) {
9698
9699 if (!updateToLocation(Loc))
9700 return Loc.IP;
9701
9702 assert(X.Var->getType()->isPointerTy() &&
9703 "OMP atomic expects a pointer to target memory");
9704 // compare capture
9705 if (V.Var) {
9706 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9707 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9708 }
9709
9710 bool IsInteger = E->getType()->isIntegerTy();
9711
9712 if (Op == OMPAtomicCompareOp::EQ) {
9713 AtomicCmpXchgInst *Result = nullptr;
9714 if (!IsInteger) {
9715 IntegerType *IntCastTy =
9716 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9717 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9718 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9719 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9720 AO, Failure);
9721 } else {
9722 Result =
9723 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9724 }
9725
9726 if (V.Var) {
9727 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9728 if (!IsInteger)
9729 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9730 assert(OldValue->getType() == V.ElemTy &&
9731 "OldValue and V must be of same type");
9732 if (IsPostfixUpdate) {
9733 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9734 } else {
9735 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9736 if (IsFailOnly) {
9737 // CurBB----
9738 // | |
9739 // v |
9740 // ContBB |
9741 // | |
9742 // v |
9743 // ExitBB <-
9744 //
9745 // where ContBB only contains the store of old value to 'v'.
9746 BasicBlock *CurBB = Builder.GetInsertBlock();
9747 Instruction *CurBBTI = CurBB->getTerminator();
9748 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9749 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9750 CurBBTI, X.Var->getName() + ".atomic.exit");
9751 BasicBlock *ContBB = CurBB->splitBasicBlock(
9752 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9753 ContBB->getTerminator()->eraseFromParent();
9754 CurBB->getTerminator()->eraseFromParent();
9755
9756 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9757
9758 Builder.SetInsertPoint(ContBB);
9759 Builder.CreateStore(OldValue, V.Var);
9760 Builder.CreateBr(ExitBB);
9761
9762 if (UnreachableInst *ExitTI =
9763 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9764 CurBBTI->eraseFromParent();
9765 Builder.SetInsertPoint(ExitBB);
9766 } else {
9767 Builder.SetInsertPoint(ExitTI);
9768 }
9769 } else {
9770 Value *CapturedValue =
9771 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9772 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9773 }
9774 }
9775 }
9776 // The comparison result has to be stored.
9777 if (R.Var) {
9778 assert(R.Var->getType()->isPointerTy() &&
9779 "r.var must be of pointer type");
9780 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9781
9782 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9783 Value *ResultCast = R.IsSigned
9784 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9785 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9786 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9787 }
9788 } else {
9789 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9790 "Op should be either max or min at this point");
9791 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9792
9793 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9794 // Let's take max as example.
9795 // OpenMP form:
9796 // x = x > expr ? expr : x;
9797 // LLVM form:
9798 // *ptr = *ptr > val ? *ptr : val;
9799 // We need to transform to LLVM form.
9800 // x = x <= expr ? x : expr;
9801 AtomicRMWInst::BinOp NewOp;
9802 if (IsXBinopExpr) {
9803 if (IsInteger) {
9804 if (X.IsSigned)
9805 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9806 : AtomicRMWInst::Max;
9807 else
9808 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9809 : AtomicRMWInst::UMax;
9810 } else {
9811 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9812 : AtomicRMWInst::FMax;
9813 }
9814 } else {
9815 if (IsInteger) {
9816 if (X.IsSigned)
9817 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9818 : AtomicRMWInst::Min;
9819 else
9820 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9821 : AtomicRMWInst::UMin;
9822 } else {
9823 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9824 : AtomicRMWInst::FMin;
9825 }
9826 }
9827
9828 AtomicRMWInst *OldValue =
9829 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9830 if (V.Var) {
9831 Value *CapturedValue = nullptr;
9832 if (IsPostfixUpdate) {
9833 CapturedValue = OldValue;
9834 } else {
9835 CmpInst::Predicate Pred;
9836 switch (NewOp) {
9837 case AtomicRMWInst::Max:
9838 Pred = CmpInst::ICMP_SGT;
9839 break;
9840 case AtomicRMWInst::UMax:
9841 Pred = CmpInst::ICMP_UGT;
9842 break;
9843 case AtomicRMWInst::FMax:
9844 Pred = CmpInst::FCMP_OGT;
9845 break;
9846 case AtomicRMWInst::Min:
9847 Pred = CmpInst::ICMP_SLT;
9848 break;
9849 case AtomicRMWInst::UMin:
9850 Pred = CmpInst::ICMP_ULT;
9851 break;
9852 case AtomicRMWInst::FMin:
9853 Pred = CmpInst::FCMP_OLT;
9854 break;
9855 default:
9856 llvm_unreachable("unexpected comparison op");
9857 }
9858 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9859 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9860 }
9861 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9862 }
9863 }
9864
9865 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9866
9867 return Builder.saveIP();
9868}
9869
9870OpenMPIRBuilder::InsertPointOrErrorTy
9871OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9872 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9873 Value *NumTeamsUpper, Value *ThreadLimit,
9874 Value *IfExpr) {
9875 if (!updateToLocation(Loc))
9876 return InsertPointTy();
9877
9878 uint32_t SrcLocStrSize;
9879 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9880 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9881 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9882
9883 // Outer allocation basicblock is the entry block of the current function.
9884 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9885 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9886 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9887 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9888 }
9889
9890 // The current basic block is split into four basic blocks. After outlining,
9891 // they will be mapped as follows:
9892 // ```
9893 // def current_fn() {
9894 // current_basic_block:
9895 // br label %teams.exit
9896 // teams.exit:
9897 // ; instructions after teams
9898 // }
9899 //
9900 // def outlined_fn() {
9901 // teams.alloca:
9902 // br label %teams.body
9903 // teams.body:
9904 // ; instructions within teams body
9905 // }
9906 // ```
9907 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9908 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9909 BasicBlock *AllocaBB =
9910 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9911
9912 bool SubClausesPresent =
9913 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9914 // Push num_teams
9915 if (!Config.isTargetDevice() && SubClausesPresent) {
9916 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9917 "if lowerbound is non-null, then upperbound must also be non-null "
9918 "for bounds on num_teams");
9919
9920 if (NumTeamsUpper == nullptr)
9921 NumTeamsUpper = Builder.getInt32(0);
9922
9923 if (NumTeamsLower == nullptr)
9924 NumTeamsLower = NumTeamsUpper;
9925
9926 if (IfExpr) {
9927 assert(IfExpr->getType()->isIntegerTy() &&
9928 "argument to if clause must be an integer value");
9929
9930 // upper = ifexpr ? upper : 1
9931 if (IfExpr->getType() != Int1)
9932 IfExpr = Builder.CreateICmpNE(IfExpr,
9933 ConstantInt::get(IfExpr->getType(), 0));
9934 NumTeamsUpper = Builder.CreateSelect(
9935 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9936
9937 // lower = ifexpr ? lower : 1
9938 NumTeamsLower = Builder.CreateSelect(
9939 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9940 }
9941
9942 if (ThreadLimit == nullptr)
9943 ThreadLimit = Builder.getInt32(0);
9944
9945 Value *ThreadNum = getOrCreateThreadID(Ident);
9946 Builder.CreateCall(
9947 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9948 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9949 }
9950 // Generate the body of teams.
9951 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9952 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9953 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9954 return Err;
9955
9956 OutlineInfo OI;
9957 OI.EntryBB = AllocaBB;
9958 OI.ExitBB = ExitBB;
9959 OI.OuterAllocaBB = &OuterAllocaBB;
9960
9961 // Insert fake values for global tid and bound tid.
9962 SmallVector<Instruction *, 8> ToBeDeleted;
9963 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9964 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9965 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9966 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9967 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9968
9969 auto HostPostOutlineCB = [this, Ident,
9970 ToBeDeleted](Function &OutlinedFn) mutable {
9971 // The stale call instruction will be replaced with a new call instruction
9972 // for the runtime call with the outlined function.
9973
9974 assert(OutlinedFn.hasOneUse() &&
9975 "there must be a single user for the outlined function");
9976 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9977 ToBeDeleted.push_back(StaleCI);
9978
9979 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9980 "Outlined function must have two or three arguments only");
9981
9982 bool HasShared = OutlinedFn.arg_size() == 3;
9983
9984 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9985 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9986 if (HasShared)
9987 OutlinedFn.getArg(2)->setName("data");
9988
9989 // Call to the runtime function for teams in the current function.
9990 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9991 "outlined function.");
9992 Builder.SetInsertPoint(StaleCI);
9993 SmallVector<Value *> Args = {
9994 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9995 if (HasShared)
9996 Args.push_back(StaleCI->getArgOperand(2));
9997 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9998 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9999 Args);
10000
10001 for (Instruction *I : llvm::reverse(ToBeDeleted))
10002 I->eraseFromParent();
10003 };
10004
10005 if (!Config.isTargetDevice())
10006 OI.PostOutlineCB = HostPostOutlineCB;
10007
10008 addOutlineInfo(std::move(OI));
10009
10010 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10011
10012 return Builder.saveIP();
10013}
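// On the host, the construct above ultimately lowers to the following runtime
// call shape (a minimal sketch; value names and the trailing shared-data
// argument are illustrative, the callee names match the code above):
// ```
// ; only when num_teams/thread_limit/if clauses are present:
// call void @__kmpc_push_num_teams_51(ptr %ident, i32 %gtid,
//                                     i32 %num.teams.lo, i32 %num.teams.hi,
//                                     i32 %thread.limit)
// ; emitted by HostPostOutlineCB in place of the stale outlined call:
// call void @__kmpc_fork_teams(ptr %ident, i32 %nargs, ptr @outlined_fn, ...)
// ```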
10014
10015OpenMPIRBuilder::InsertPointOrErrorTy
10016OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10017 InsertPointTy OuterAllocaIP,
10018 BodyGenCallbackTy BodyGenCB) {
10019 if (!updateToLocation(Loc))
10020 return InsertPointTy();
10021
10022 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10023
10024 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10025 BasicBlock *BodyBB =
10026 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10027 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10028 }
10029 BasicBlock *ExitBB =
10030 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10031 BasicBlock *BodyBB =
10032 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10033 BasicBlock *AllocaBB =
10034 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10035
10036 // Generate the body of the distribute region.
10037 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10038 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10039 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10040 return Err;
10041
10042 // When compiling for a target device, we use different runtime functions,
10043 // which require a callback.
10044 if (Config.isTargetDevice()) {
10045 OutlineInfo OI;
10046 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10047 OI.EntryBB = AllocaBB;
10048 OI.ExitBB = ExitBB;
10049
10050 addOutlineInfo(std::move(OI));
10051 }
10052 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10053
10054 return Builder.saveIP();
10055}
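// A minimal usage sketch for the API above (assuming an initialized
// OpenMPIRBuilder `OMPBuilder`, its IRBuilder `Builder`, a valid `Loc`, and an
// `AllocaIP` in the enclosing function; only the body callback is user code):
// ```
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                        OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
//     Builder.restoreIP(CodeGenIP);
//     // ... emit the distribute region body here ...
//     return Error::success();
//   };
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//       OMPBuilder.createDistribute(Loc, AllocaIP, BodyGenCB);
// ```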
10056
10057Constant *
10058OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10059 std::string VarName) {
10060 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10061 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10062 Names.size()),
10063 Names);
10064 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10065 M, MapNamesArrayInit->getType(),
10066 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10067 VarName);
10068 return MapNamesArrayGlobal;
10069}
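// The emitted module-level IR is a private constant array holding one pointer
// per map-entry name string, e.g. (global names are illustrative):
// ```
// @.offload_mapnames = private constant [2 x ptr]
//     [ptr @.offload_name.0, ptr @.offload_name.1]
// ```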
10070
10071// Create all simple and struct types exposed by the runtime and remember
10072// their llvm::PointerTypes for easy access later.
10073void OpenMPIRBuilder::initializeTypes(Module &M) {
10074 LLVMContext &Ctx = M.getContext();
10075 StructType *T;
10076 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10077 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
10078#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10079#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10080 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10081 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10082#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10083 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10084 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
10085#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10086 T = StructType::getTypeByName(Ctx, StructName); \
10087 if (!T) \
10088 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10089 VarName = T; \
10090 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10091#include "llvm/Frontend/OpenMP/OMPKinds.def"
10092}
10093
10094void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10095 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10096 SmallVectorImpl<BasicBlock *> &BlockVector) {
10097 SmallVector<BasicBlock *, 32> Worklist;
10098 BlockSet.insert(EntryBB);
10099 BlockSet.insert(ExitBB);
10100
10101 Worklist.push_back(EntryBB);
10102 while (!Worklist.empty()) {
10103 BasicBlock *BB = Worklist.pop_back_val();
10104 BlockVector.push_back(BB);
10105 for (BasicBlock *SuccBB : successors(BB))
10106 if (BlockSet.insert(SuccBB).second)
10107 Worklist.push_back(SuccBB);
10108 }
10109}
10110
10111void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10112 uint64_t Size, int32_t Flags,
10113 GlobalValue::LinkageTypes,
10114 StringRef Name) {
10115 if (!Config.isGPU()) {
10116 llvm::offloading::emitOffloadingEntry(
10117 M, object::OffloadKind::OFK_OpenMP, ID,
10118 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10119 return;
10120 }
10121 // TODO: Add support for global variables on the device after declare target
10122 // support.
10123 Function *Fn = dyn_cast<Function>(Addr);
10124 if (!Fn)
10125 return;
10126
10127 // Add a function attribute for the kernel.
10128 Fn->addFnAttr("kernel");
10129 if (T.isAMDGCN())
10130 Fn->addFnAttr("uniform-work-group-size", "true");
10131 Fn->addFnAttr(Attribute::MustProgress);
10132}
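// On the GPU path the only effect is the kernel annotation above; for an
// AMDGCN target the resulting function attributes look roughly like this
// (illustrative; the kernel name and attribute-group number are placeholders):
// ```
// define void @__omp_offloading_..._l1(...) #0 { ... }
// attributes #0 = { mustprogress "kernel" "uniform-work-group-size"="true" }
// ```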
10133
10134// We only generate metadata for functions that contain target regions.
10135void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10136 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10137
10138 // If there are no entries, we don't need to do anything.
10139 if (OffloadInfoManager.empty())
10140 return;
10141
10142 LLVMContext &C = M.getContext();
10143 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10144 TargetRegionEntryInfo>,
10145 16>
10146 OrderedEntries(OffloadInfoManager.size());
10147
10148 // Auxiliary methods to create metadata values and strings.
10149 auto &&GetMDInt = [this](unsigned V) {
10150 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10151 };
10152
10153 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10154
10155 // Create the offloading info metadata node.
10156 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10157 auto &&TargetRegionMetadataEmitter =
10158 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10159 const TargetRegionEntryInfo &EntryInfo,
10160 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10161 // Generate metadata for target regions. Each entry of this metadata
10162 // contains:
10163 // - Entry 0 -> Kind of this type of metadata (0).
10164 // - Entry 1 -> Device ID of the file where the entry was identified.
10165 // - Entry 2 -> File ID of the file where the entry was identified.
10166 // - Entry 3 -> Mangled name of the function where the entry was
10167 // identified.
10168 // - Entry 4 -> Line in the file where the entry was identified.
10169 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10170 // - Entry 6 -> Order the entry was created.
10171 // The first element of the metadata node is the kind.
10172 Metadata *Ops[] = {
10173 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10174 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10175 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10176 GetMDInt(E.getOrder())};
10177
10178 // Save this entry in the right position of the ordered entries array.
10179 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10180
10181 // Add metadata to the named metadata node.
10182 MD->addOperand(MDNode::get(C, Ops));
10183 };
10184
10185 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10186
10187 // Create a function that emits metadata for each device global variable entry.
10188 auto &&DeviceGlobalVarMetadataEmitter =
10189 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10190 StringRef MangledName,
10191 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10192 // Generate metadata for global variables. Each entry of this metadata
10193 // contains:
10194 // - Entry 0 -> Kind of this type of metadata (1).
10195 // - Entry 1 -> Mangled name of the variable.
10196 // - Entry 2 -> Declare target kind.
10197 // - Entry 3 -> Order the entry was created.
10198 // The first element of the metadata node is the kind.
10199 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10200 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10201
10202 // Save this entry in the right position of the ordered entries array.
10203 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10204 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10205
10206 // Add metadata to the named metadata node.
10207 MD->addOperand(MDNode::get(C, Ops));
10208 };
10209
10210 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10211 DeviceGlobalVarMetadataEmitter);
10212
10213 for (const auto &E : OrderedEntries) {
10214 assert(E.first && "All ordered entries must exist!");
10215 if (const auto *CE =
10216 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10217 E.first)) {
10218 if (!CE->getID() || !CE->getAddress()) {
10219 // Do not blame the entry if the parent function is not emitted.
10220 TargetRegionEntryInfo EntryInfo = E.second;
10221 StringRef FnName = EntryInfo.ParentName;
10222 if (!M.getNamedValue(FnName))
10223 continue;
10224 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10225 continue;
10226 }
10227 createOffloadEntry(CE->getID(), CE->getAddress(),
10228 /*Size=*/0, CE->getFlags(),
10229 GlobalValue::WeakAnyLinkage);
10230 } else if (const auto *CE = dyn_cast<
10231 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10232 E.first)) {
10233 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10234 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10235 CE->getFlags());
10236 switch (Flags) {
10237 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10238 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10239 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10240 continue;
10241 if (!CE->getAddress()) {
10242 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10243 continue;
10244 }
10245 // The variable has no definition - no need to add the entry.
10246 if (CE->getVarSize() == 0)
10247 continue;
10248 break;
10249 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10250 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10251 (!Config.isTargetDevice() && CE->getAddress())) &&
10252 "Declaret target link address is set.");
10253 if (Config.isTargetDevice())
10254 continue;
10255 if (!CE->getAddress()) {
10256 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10257 continue;
10258 }
10259 break;
10260 default:
10261 break;
10262 }
10263
10264 // Hidden or internal symbols on the device are not externally visible.
10265 // We should not attempt to register them by creating an offloading
10266 // entry. Indirect variables are handled separately on the device.
10267 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10268 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10269 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10270 continue;
10271
10272 // Indirect globals need to use a special name that doesn't match the name
10273 // of the associated host global.
10274 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10275 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10276 Flags, CE->getLinkage(), CE->getVarName());
10277 else
10278 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10279 Flags, CE->getLinkage());
10280
10281 } else {
10282 llvm_unreachable("Unsupported entry kind.");
10283 }
10284 }
10285
10286 // Emit requires directive globals to a special entry so the runtime can
10287 // register them when the device image is loaded.
10288 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10289 // entries should be redesigned to better suit this use-case.
10290 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10291 llvm::offloading::emitOffloadingEntry(
10292 M, object::OffloadKind::OFK_OpenMP,
10293 Constant::getNullValue(PointerType::getUnqual(M.getContext())),
10294 ".requires", /*Size=*/0,
10295 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10296 Config.getRequiresFlags());
10297}
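// For reference, a single target region registered by the emitters above shows
// up in the host IR as follows (values illustrative; the fields follow the
// layout documented in TargetRegionMetadataEmitter):
// ```
// !omp_offload.info = !{!0}
// !0 = !{i32 0, i32 42, i32 4660, !"_Z3foov", i32 17, i32 0, i32 0}
// ;      kind   dev-id  file-id   parent name  line   count  order
// ```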
10298
10299void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10300 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10301 unsigned FileID, unsigned Line, unsigned Count) {
10302 raw_svector_ostream OS(Name);
10303 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10304 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10305 if (Count)
10306 OS << "_" << Count;
10307}
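// For example, DeviceID 0x2b, FileID 0x1a2b, parent "_Z3foov", line 12 and a
// count of 0 produce a kernel name of the form:
// ```
// __omp_offloading_2b_1a2b__Z3foov_l12
// ```
// (a non-zero count appends a further "_<count>" suffix).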
10308
10309void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10310 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10311 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10312 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10313 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10314 EntryInfo.Line, NewCount);
10315}
10316
10317TargetRegionEntryInfo
10318OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10319 vfs::FileSystem &VFS,
10320 StringRef ParentName) {
10321 sys::fs::UniqueID ID(0xdeadf17e, 0);
10322 auto FileIDInfo = CallBack();
10323 uint64_t FileID = 0;
10324 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
10325 ID = Status->getUniqueID();
10326 FileID = Status->getUniqueID().getFile();
10327 } else {
10328 // If the inode ID could not be determined, create a hash value of
10329 // the current file name and use that as an ID.
10330 FileID = hash_value(std::get<0>(FileIDInfo));
10331 }
10332
10333 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10334 std::get<1>(FileIDInfo));
10335}
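// In other words, (DeviceID, FileID) is normally the file's unique ID from the
// filesystem; only when the status query fails does FileID fall back to a hash
// of the file name, while DeviceID keeps the 0xdeadf17e marker value.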
10336
10337unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10338 unsigned Offset = 0;
10339 for (uint64_t Remain =
10340 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10341 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10342 !(Remain & 1); Remain = Remain >> 1)
10343 Offset++;
10344 return Offset;
10345}
10346
10347omp::OpenMPOffloadMappingFlags
10348OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10349 // Rotate by getFlagMemberOffset() bits.
10350 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10351 << getFlagMemberOffset());
10352}
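// Worked example, assuming the usual encoding in which OMP_MAP_MEMBER_OF
// occupies the top 16 bits of the 64-bit flag word (so getFlagMemberOffset()
// evaluates to 48):
// ```
//   getMemberOfFlag(0) == 0x0001'0000'0000'0000   // first member (1-based)
//   getMemberOfFlag(2) == 0x0003'0000'0000'0000   // (2 + 1) << 48
// ```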
10353
10354void OpenMPIRBuilder::setCorrectMemberOfFlag(
10355 omp::OpenMPOffloadMappingFlags &Flags,
10356 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10357 // If the entry is PTR_AND_OBJ but has not been marked with the special
10358 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10359 // marked as MEMBER_OF.
10360 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10361 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10362 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10363 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10364 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10365 return;
10366
10367 // Reset the placeholder value to prepare the flag for the assignment of the
10368 // proper MEMBER_OF value.
10369 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10370 Flags |= MemberOfFlag;
10371}
10372
10373Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10374 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10375 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10376 bool IsDeclaration, bool IsExternallyVisible,
10377 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10378 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10379 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10380 std::function<Constant *()> GlobalInitializer,
10381 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10382 // TODO: convert this to utilise the IRBuilder Config rather than
10383 // a passed down argument.
10384 if (OpenMPSIMD)
10385 return nullptr;
10386
10387 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10388 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10389 CaptureClause ==
10390 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10391 Config.hasRequiresUnifiedSharedMemory())) {
10392 SmallString<64> PtrName;
10393 {
10394 raw_svector_ostream OS(PtrName);
10395 OS << MangledName;
10396 if (!IsExternallyVisible)
10397 OS << format("_%x", EntryInfo.FileID);
10398 OS << "_decl_tgt_ref_ptr";
10399 }
10400
10401 Value *Ptr = M.getNamedValue(PtrName);
10402
10403 if (!Ptr) {
10404 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10405 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10406
10407 auto *GV = cast<GlobalVariable>(Ptr);
10408 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10409
10410 if (!Config.isTargetDevice()) {
10411 if (GlobalInitializer)
10412 GV->setInitializer(GlobalInitializer());
10413 else
10414 GV->setInitializer(GlobalValue);
10415 }
10416
10417 registerTargetGlobalVariable(
10418 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10419 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10420 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10421 }
10422
10423 return cast<Constant>(Ptr);
10424 }
10425
10426 return nullptr;
10427}
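// For a variable mangled as "gvar" this creates (or reuses) a weak pointer
// indirection roughly of the shape below; a non-externally-visible variable
// additionally gets a "_<FileID in hex>" infix before the suffix
// (illustrative IR):
// ```
// @gvar_decl_tgt_ref_ptr = weak global ptr @gvar
// ```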
10428
10429void OpenMPIRBuilder::registerTargetGlobalVariable(
10430 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10431 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10432 bool IsDeclaration, bool IsExternallyVisible,
10433 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10434 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10435 std::vector<Triple> TargetTriple,
10436 std::function<Constant *()> GlobalInitializer,
10437 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10438 Constant *Addr) {
10439 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10440 (TargetTriple.empty() && !Config.isTargetDevice()))
10441 return;
10442
10443 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10444 StringRef VarName;
10445 int64_t VarSize;
10446 GlobalValue::LinkageTypes Linkage;
10447
10448 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10449 CaptureClause ==
10450 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10451 !Config.hasRequiresUnifiedSharedMemory()) {
10452 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10453 VarName = MangledName;
10454 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10455
10456 if (!IsDeclaration)
10457 VarSize = divideCeil(
10458 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10459 else
10460 VarSize = 0;
10461 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10462
10463 // This is a workaround carried over from Clang which prevents undesired
10464 // optimisation of internal variables.
10465 if (Config.isTargetDevice() &&
10466 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10467 // Do not create a "ref-variable" if the original is not also available
10468 // on the host.
10469 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10470 return;
10471
10472 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10473
10474 if (!M.getNamedValue(RefName)) {
10475 Constant *AddrRef =
10476 getOrCreateInternalVariable(Addr->getType(), RefName);
10477 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10478 GvAddrRef->setConstant(true);
10479 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10480 GvAddrRef->setInitializer(Addr);
10481 GeneratedRefs.push_back(GvAddrRef);
10482 }
10483 }
10484 } else {
10485 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10486 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10487 else
10488 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10489
10490 if (Config.isTargetDevice()) {
10491 VarName = (Addr) ? Addr->getName() : "";
10492 Addr = nullptr;
10493 } else {
10494 Addr = getAddrOfDeclareTargetVar(
10495 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10496 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10497 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10498 VarName = (Addr) ? Addr->getName() : "";
10499 }
10500 VarSize = M.getDataLayout().getPointerSize();
10501 Linkage = GlobalValue::WeakAnyLinkage;
10502 }
10503
10504 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10505 Flags, Linkage);
10506}
10507
10508/// Loads all the offload entries information from the host IR
10509/// metadata.
10510void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10511 // If we are in target mode, load the metadata from the host IR. This code has
10512 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10513
10514 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10515 if (!MD)
10516 return;
10517
10518 for (MDNode *MN : MD->operands()) {
10519 auto &&GetMDInt = [MN](unsigned Idx) {
10520 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10521 return cast<ConstantInt>(V->getValue())->getZExtValue();
10522 };
10523
10524 auto &&GetMDString = [MN](unsigned Idx) {
10525 auto *V = cast<MDString>(MN->getOperand(Idx));
10526 return V->getString();
10527 };
10528
10529 switch (GetMDInt(0)) {
10530 default:
10531 llvm_unreachable("Unexpected metadata!");
10532 break;
10533 case OffloadEntriesInfoManager::OffloadEntryInfo::
10534 OffloadingEntryInfoTargetRegion: {
10535 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10536 /*DeviceID=*/GetMDInt(1),
10537 /*FileID=*/GetMDInt(2),
10538 /*Line=*/GetMDInt(4),
10539 /*Count=*/GetMDInt(5));
10540 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10541 /*Order=*/GetMDInt(6));
10542 break;
10543 }
10544 case OffloadEntriesInfoManager::OffloadEntryInfo::
10545 OffloadingEntryInfoDeviceGlobalVar:
10546 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10547 /*MangledName=*/GetMDString(1),
10548 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10549 /*Flags=*/GetMDInt(2)),
10550 /*Order=*/GetMDInt(3));
10551 break;
10552 }
10553 }
10554}
10555
10556void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10557 StringRef HostFilePath) {
10558 if (HostFilePath.empty())
10559 return;
10560
10561 auto Buf = VFS.getBufferForFile(HostFilePath);
10562 if (std::error_code Err = Buf.getError()) {
10563 report_fatal_error(("error opening host file from host file path inside of "
10564 "OpenMPIRBuilder: " +
10565 Err.message())
10566 .c_str());
10567 }
10568
10569 LLVMContext Ctx;
10570 auto M = expectedToErrorOrAndEmitErrors(
10571 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10572 if (std::error_code Err = M.getError()) {
10573 report_fatal_error(
10574 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10575 .c_str());
10576 }
10577
10578 loadOffloadInfoMetadata(*M.get());
10579}
10580
10581//===----------------------------------------------------------------------===//
10582// OffloadEntriesInfoManager
10583//===----------------------------------------------------------------------===//
10584
10585bool OffloadEntriesInfoManager::empty() const {
10586 return OffloadEntriesTargetRegion.empty() &&
10587 OffloadEntriesDeviceGlobalVar.empty();
10588}
10589
10590unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10591 const TargetRegionEntryInfo &EntryInfo) const {
10592 auto It = OffloadEntriesTargetRegionCount.find(
10593 getTargetRegionEntryCountKey(EntryInfo));
10594 if (It == OffloadEntriesTargetRegionCount.end())
10595 return 0;
10596 return It->second;
10597}
10598
10599void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10600 const TargetRegionEntryInfo &EntryInfo) {
10601 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10602 EntryInfo.Count + 1;
10603}
10604
10605/// Initialize target region entry.
10606void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10607 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10608 OffloadEntriesTargetRegion[EntryInfo] =
10609 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10610 OMPTargetRegionEntryTargetRegion);
10611 ++OffloadingEntriesNum;
10612}
10613
10614void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10615 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10616 OMPTargetRegionEntryKind Flags) {
10617 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10618
10619 // Update the EntryInfo with the next available count for this location.
10620 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10621
10622 // If we are emitting code for a target, the entry is already initialized,
10623 // only has to be registered.
10624 if (OMPBuilder->Config.isTargetDevice()) {
10625 // This could happen if the device compilation is invoked standalone.
10626 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10627 return;
10628 }
10629 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10630 Entry.setAddress(Addr);
10631 Entry.setID(ID);
10632 Entry.setFlags(Flags);
10633 } else {
10634 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10635 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10636 return;
10637 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10638 "Target region entry already registered!");
10639 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10640 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10641 ++OffloadingEntriesNum;
10642 }
10643 incrementTargetRegionEntryInfoCount(EntryInfo);
10644}
10645
10646bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10647 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10648
10649 // Update the EntryInfo with the next available count for this location.
10650 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10651
10652 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10653 if (It == OffloadEntriesTargetRegion.end()) {
10654 return false;
10655 }
10656 // Fail if this entry is already registered.
10657 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10658 return false;
10659 return true;
10660}
10661
10662void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10663 const OffloadTargetRegionEntryInfoActTy &Action) {
10664 // Scan all target region entries and perform the provided action.
10665 for (const auto &It : OffloadEntriesTargetRegion) {
10666 Action(It.first, It.second);
10667 }
10668}
10669
10670void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10671 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10672 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10673 ++OffloadingEntriesNum;
10674}
10675
10676void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10677 StringRef VarName, Constant *Addr, int64_t VarSize,
10678 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10679 if (OMPBuilder->Config.isTargetDevice()) {
10680 // This could happen if the device compilation is invoked standalone.
10681 if (!hasDeviceGlobalVarEntryInfo(VarName))
10682 return;
10683 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10684 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10685 if (Entry.getVarSize() == 0) {
10686 Entry.setVarSize(VarSize);
10687 Entry.setLinkage(Linkage);
10688 }
10689 return;
10690 }
10691 Entry.setVarSize(VarSize);
10692 Entry.setLinkage(Linkage);
10693 Entry.setAddress(Addr);
10694 } else {
10695 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10696 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10697 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10698 "Entry not initialized!");
10699 if (Entry.getVarSize() == 0) {
10700 Entry.setVarSize(VarSize);
10701 Entry.setLinkage(Linkage);
10702 }
10703 return;
10704 }
10705 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10706 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10707 Addr, VarSize, Flags, Linkage,
10708 VarName.str());
10709 else
10710 OffloadEntriesDeviceGlobalVar.try_emplace(
10711 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10712 ++OffloadingEntriesNum;
10713 }
10714}
10715
10716void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10717 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10718 // Scan all device global variable entries and perform the provided action.
10719 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10720 Action(E.getKey(), E.getValue());
10721}
10722
10723//===----------------------------------------------------------------------===//
10724// CanonicalLoopInfo
10725//===----------------------------------------------------------------------===//
10726
10727void CanonicalLoopInfo::collectControlBlocks(
10728 SmallVectorImpl<BasicBlock *> &BBs) {
10729 // We only count those BBs as control blocks for which we do not need to
10730 // reverse the CFG, i.e. not the loop body, which can contain arbitrary control
10731 // flow. For consistency, this also means we do not add the Body block, which
10732 // is just the entry to the body code.
10733 BBs.reserve(BBs.size() + 6);
10734 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10735}
10736
10737BasicBlock *CanonicalLoopInfo::getPreheader() const {
10738 assert(isValid() && "Requires a valid canonical loop");
10739 for (BasicBlock *Pred : predecessors(Header)) {
10740 if (Pred != Latch)
10741 return Pred;
10742 }
10743 llvm_unreachable("Missing preheader");
10744}
10745
10746void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10747 assert(isValid() && "Requires a valid canonical loop");
10748
10749 Instruction *CmpI = &getCond()->front();
10750 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10751 CmpI->setOperand(1, TripCount);
10752
10753#ifndef NDEBUG
10754 assertOK();
10755#endif
10756}
10757
10758void CanonicalLoopInfo::mapIndVar(
10759 llvm::function_ref<Value *(Instruction *)> Updater) {
10760 assert(isValid() && "Requires a valid canonical loop");
10761
10762 Instruction *OldIV = getIndVar();
10763
10764 // Record all uses excluding those introduced by the updater. Uses by the
10765 // CanonicalLoopInfo itself to keep track of the number of iterations are
10766 // excluded.
10767 SmallVector<Use *> ReplacableUses;
10768 for (Use &U : OldIV->uses()) {
10769 auto *User = dyn_cast<Instruction>(U.getUser());
10770 if (!User)
10771 continue;
10772 if (User->getParent() == getCond())
10773 continue;
10774 if (User->getParent() == getLatch())
10775 continue;
10776 ReplacableUses.push_back(&U);
10777 }
10778
10779 // Run the updater that may introduce new uses
10780 Value *NewIV = Updater(OldIV);
10781
10782 // Replace the old uses with the value returned by the updater.
10783 for (Use *U : ReplacableUses)
10784 U->set(NewIV);
10785
10786#ifndef NDEBUG
10787 assertOK();
10788#endif
10789}
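// A minimal usage sketch (assuming a valid CanonicalLoopInfo `CLI`, an
// IRBuilder `Builder`, and a loop-invariant `Start` value): shift every
// recorded use of the canonical induction variable by `Start`; the add emitted
// inside the callback is itself a new use and is deliberately not rewritten:
// ```
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(OldIV, Start, "iv.shifted");
//   });
// ```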
10790
10791void CanonicalLoopInfo::assertOK() const {
10792#ifndef NDEBUG
10793 // No constraints if this object currently does not describe a loop.
10794 if (!isValid())
10795 return;
10796
10797 BasicBlock *Preheader = getPreheader();
10798 BasicBlock *Body = getBody();
10799 BasicBlock *After = getAfter();
10800
10801 // Verify standard control-flow we use for OpenMP loops.
10802 assert(Preheader);
10803 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10804 "Preheader must terminate with unconditional branch");
10805 assert(Preheader->getSingleSuccessor() == Header &&
10806 "Preheader must jump to header");
10807
10808 assert(Header);
10809 assert(isa<BranchInst>(Header->getTerminator()) &&
10810 "Header must terminate with unconditional branch");
10811 assert(Header->getSingleSuccessor() == Cond &&
10812 "Header must jump to exiting block");
10813
10814 assert(Cond);
10815 assert(Cond->getSinglePredecessor() == Header &&
10816 "Exiting block only reachable from header");
10817
10818 assert(isa<BranchInst>(Cond->getTerminator()) &&
10819 "Exiting block must terminate with conditional branch");
10820 assert(size(successors(Cond)) == 2 &&
10821 "Exiting block must have two successors");
10822 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10823 "Exiting block's first successor jump to the body");
10824 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10825 "Exiting block's second successor must exit the loop");
10826
10827 assert(Body);
10828 assert(Body->getSinglePredecessor() == Cond &&
10829 "Body only reachable from exiting block");
10830 assert(!isa<PHINode>(Body->front()));
10831
10832 assert(Latch);
10833 assert(isa<BranchInst>(Latch->getTerminator()) &&
10834 "Latch must terminate with unconditional branch");
10835 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10836 // TODO: To support simple redirecting of the end of the body code that has
10837 // multiple predecessors; introduce another auxiliary basic block like preheader and after.
10838 assert(Latch->getSinglePredecessor() != nullptr);
10839 assert(!isa<PHINode>(Latch->front()));
10840
10841 assert(Exit);
10842 assert(isa<BranchInst>(Exit->getTerminator()) &&
10843 "Exit block must terminate with unconditional branch");
10844 assert(Exit->getSingleSuccessor() == After &&
10845 "Exit block must jump to after block");
10846
10847 assert(After);
10848 assert(After->getSinglePredecessor() == Exit &&
10849 "After block only reachable from exit block");
10850 assert(After->empty() || !isa<PHINode>(After->front()));
10851
10852 Instruction *IndVar = getIndVar();
10853 assert(IndVar && "Canonical induction variable not found?");
10854 assert(isa<IntegerType>(IndVar->getType()) &&
10855 "Induction variable must be an integer");
10856 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10857 "Induction variable must be a PHI in the loop header");
10858 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10859 assert(
10860 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10861 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10862
10863 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10864 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10865 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10866 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10867 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10868 ->isOne());
10869
10870 Value *TripCount = getTripCount();
10871 assert(TripCount && "Loop trip count not found?");
10872 assert(IndVar->getType() == TripCount->getType() &&
10873 "Trip count and induction variable must have the same type");
10874
10875 auto *CmpI = cast<CmpInst>(&Cond->front());
10876 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10877 "Exit condition must be a signed less-than comparison");
10878 assert(CmpI->getOperand(0) == IndVar &&
10879 "Exit condition must compare the induction variable");
10880 assert(CmpI->getOperand(1) == TripCount &&
10881 "Exit condition must compare with the trip count");
10882#endif
10883}
10884
10885void CanonicalLoopInfo::invalidate() {
10886 Header = nullptr;
10887 Cond = nullptr;
10888 Latch = nullptr;
10889 Exit = nullptr;
10890}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
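A short sketch of the ConstantInt factories above; the enclosing function is only for illustration:
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Build an i1 true and a signed 32-bit -1 via the factories documented above.
static void makeConstants(LLVMContext &Ctx) {
  ConstantInt *True = ConstantInt::getTrue(Ctx);
  ConstantInt *MinusOne =
      ConstantInt::getSigned(IntegerType::get(Ctx, 32), -1);
  (void)True;
  (void)MinusOne;
}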
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:557
Record of a variable value-assignment, aka a non-instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
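The Error/Expected entries above compose as in this sketch (the wrapper function is hypothetical):
#include "llvm/Support/Error.h"
using namespace llvm;

// Propagate failure with takeError(), or consume the value on success.
static Error consume(Expected<int> E) {
  if (!E)
    return E.takeError();
  int Value = *E; // safe: E holds a value on this path
  (void)Value;
  return Error::success();
}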
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
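Putting the FunctionType::get and Function::Create entries above together; "helper" is a made-up symbol, not a real runtime entry point:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Declare: void helper(i32) -- a hypothetical external function.
static Function *declareHelper(Module &M) {
  LLVMContext &Ctx = M.getContext();
  auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx),
                                 {Type::getInt32Ty(Ctx)}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FnTy, GlobalValue::ExternalLinkage, "helper", &M);
  F->addFnAttr(Attribute::NoUnwind);
  return F;
}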
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the list of attributes for the given arg.
Adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:524
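A sketch (global name assumed) combining the linkage and initializer entries above:
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Create an internal, zero-initialized i64 global named "counter".
static GlobalVariable *makeCounter(Module &M) {
  Type *I64 = Type::getInt64Ty(M.getContext());
  auto *GV = new GlobalVariable(M, I64, /*isConstant=*/false,
                                GlobalValue::InternalLinkage,
                                Constant::getNullValue(I64), "counter");
  GV->setDSOLocal(true);
  return GV;
}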
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
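The InsertPoint entries above support the usual save/emit/restore pattern, sketched here with an assumed target block:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Temporarily redirect the builder into Target, then restore its state.
static void emitInto(IRBuilder<> &Builder, BasicBlock *Target) {
  IRBuilderBase::InsertPoint SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(Target, Target->getFirstInsertionPt());
  // ... emit instructions here ...
  Builder.restoreIP(SavedIP);
}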
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2783
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
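For the metadata accessors above, a sketch; the kind name "example.tag" and the payload string are invented:
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Attach a one-operand string annotation under an invented metadata kind.
static void tagInstruction(Instruction *I) {
  LLVMContext &Ctx = I->getContext();
  MDNode *Node = MDNode::get(Ctx, MDString::get(Ctx, "payload"));
  I->setMetadata(Ctx.getMDKindID("example.tag"), Node);
}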
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple, which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:229
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
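The StoreInst entries above combine as follows (the 8-byte alignment is an assumed value, not a requirement):
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

// Emit a store, force 8-byte alignment, and make it monotonic-atomic.
static void emitAtomicStore(IRBuilder<> &Builder, Value *Val, Value *Ptr) {
  StoreInst *SI = Builder.CreateStore(Val, Ptr);
  SI->setAlignment(Align(8));
  SI->setAtomic(AtomicOrdering::Monotonic);
}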
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
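The StringRef helpers above chain naturally; a small sketch with an assumed suffix:
#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Strip a ".inc" suffix if present, using ends_with and drop_back above.
static StringRef stripIncSuffix(StringRef Name) {
  if (Name.ends_with(".inc"))
    return Name.drop_back(4);
  return Name;
}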
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1040
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1102
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:411
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1118
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
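The Type factories and predicates above are typically used together, as in this small sketch:
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Query a few primitive types from one context.
static bool voidAndI64(LLVMContext &Ctx) {
  Type *VoidTy = Type::getVoidTy(Ctx);
  IntegerType *I64 = Type::getInt64Ty(Ctx);
  return VoidTy->isVoidTy() && I64->getIntegerBitWidth() == 64;
}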
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:133
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:149
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:554
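A sketch for replaceUsesWithIf above; the "outside a block" predicate is chosen purely for illustration:
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Rewrite only those uses of Old that live outside block BB.
static void replaceUsesOutside(Value *Old, Value *New, BasicBlock *BB) {
  Old->replaceUsesWithIf(New, [BB](Use &U) {
    auto *UserI = dyn_cast<Instruction>(U.getUser());
    return UserI && UserI->getParent() != BB;
  });
}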
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to a SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:345
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:733
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
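The early-increment range above is the standard way to erase while iterating; a sketch with a simplified dead-code predicate:
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Erase trivially dead instructions; early-inc keeps iteration valid
// even though the current instruction is removed from the block.
static void dropDeadInstrs(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
      I.eraseFromParent();
}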
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:677
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
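A sketch for SplitBlockAndInsertIfThenElse above, matching the signature shown; the emitted arms are left empty:
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

// Split before InsertPt and branch on Cond; each *Term is the fresh
// unconditional branch of its arm into the continuation block.
static void guardWithCond(Instruction *InsertPt, Value *Cond) {
  Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  SplitBlockAndInsertIfThenElse(Cond, InsertPt->getIterator(),
                                &ThenTerm, &ElseTerm);
  IRBuilder<> Builder(ThenTerm);
  // ... emit then-arm IR before ThenTerm here ...
}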
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:118
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...