//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
/// an InsertPoint stores the instruction before something is inserted. For
/// instance, if both point to the same instruction, two IRBuilders alternating
/// in creating instructions will cause the instructions to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic block
/// if the insert point points to the end of the block.
static void restoreIPWithDebugLoc(llvm::IRBuilderBase &Builder,
                                  llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
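/// For example, schedule(dynamic, 4) without an ordered clause first maps to
/// BaseDynamicChunked, then gains ModifierUnordered, and finally
/// ModifierNonmonotonic, i.e. UnorderedDynamicChunked | ModifierNonmonotonic
/// (an illustrative walk-through of the helpers above).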
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  return Result;
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
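// The fake value and its fake use keep the future outlined-function argument
// alive through outlining; everything created here is recorded in ToBeDeleted
// so the caller can erase it once the real argument rewiring has happened.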
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

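// For example, a translation unit containing
//   #pragma omp requires unified_shared_memory dynamic_allocators
// would be configured with OMP_REQ_UNIFIED_SHARED_MEMORY |
// OMP_REQ_DYNAMIC_ALLOCATORS (0x018) via the constructor below (illustrative).
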
OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

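  // Teams and threads counts are passed to the runtime as 3-D vectors; any
  // dimension that was not specified keeps the zero from ZeroArray, e.g. a
  // one-dimensional launch becomes {NumTeams[0], 0, 0}.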
  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

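// Typical usage of the two helpers above (illustrative):
//   Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_barrier);
//   Builder.CreateCall(Fn, {Ident, ThreadID});
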
void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; this may happen with
    // nested function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target or risk being malformed by later optimisation passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so). This notably has to occur
  // after the OutlinedInfo candidates have been extracted, so that the end
  // product is not implicitly adversely affected by any raises unless
  // intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, although they should largely be folded
  // by the time they get here. Extending it to runtime-defined/read+writeable
  // allocation sizes would be non-trivial (movement of any stores to variables
  // the allocation size depends on, as well as the usual loads, would have to
  // be factored in, otherwise it would yield the wrong result after movement)
  // and would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization\n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

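  // Idents are cached by source-location string plus the packed flag words,
  // so the same location with different flags still gets a distinct struct.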
  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

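// The runtime encodes source locations as ";file;function;line;column;;",
// e.g. ";test.c;foo;12;3;;" (illustrative; see the default string below).
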
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
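  // The unreachable instruction below is only a placeholder terminator so the
  // block-splitting utilities can run; it is erased again near the end of this
  // function once the cancellation check blocks are in place.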
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we move to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
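  // The emitted check is roughly (illustrative):
  //   %cmp = icmp eq i32 %cancel_flag, 0
  //   br i1 %cmp, label %<bb>.cont, label %<bb>.cncl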
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP DeviceRTL runtime function
// (__kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

1493OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1494 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1495 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1496 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1497 omp::ProcBindKind ProcBind, bool IsCancellable) {
1498 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1499
1500 if (!updateToLocation(Loc))
1501 return Loc.IP;
1502
1503 uint32_t SrcLocStrSize;
1504 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1505 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1506 Value *ThreadID = getOrCreateThreadID(Ident);
1507 // If we generate code for the target device, we need to allocate
1508 // struct for aggregate params in the device default alloca address space.
1509 // OpenMP runtime requires that the params of the extracted functions are
1510 // passed as zero address space pointers. This flag ensures that extracted
1511 // function arguments are declared in zero address space
1512 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1513
1514 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1515 // only if we compile for host side.
1516 if (NumThreads && !Config.isTargetDevice()) {
1517 Value *Args[] = {
1518 Ident, ThreadID,
1519 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1520 Builder.CreateCall(
1521 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1522 }
1523
1524 if (ProcBind != OMP_PROC_BIND_default) {
1525 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1526 Value *Args[] = {
1527 Ident, ThreadID,
1528 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1529 Builder.CreateCall(
1530 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1531 }
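// E.g. for proc_bind(close), this emits a sketch like the following, where 3
// is the kmp_proc_bind_t value assumed for 'close':
//
//   call void @__kmpc_push_proc_bind(ptr @ident, i32 %gtid, i32 3)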
1532
1533 BasicBlock *InsertBB = Builder.GetInsertBlock();
1534 Function *OuterFn = InsertBB->getParent();
1535
1536 // Save the outer alloca block because the insertion iterator may get
1537 // invalidated and we still need this later.
1538 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1539
1540 // Vector to remember instructions we used only during the modeling but which
1541 // we want to delete at the end.
1542  SmallVector<Instruction *, 4> ToBeDeleted;
1543
1544 // Change the location to the outer alloca insertion point to create and
1545 // initialize the allocas we pass into the parallel region.
1546 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1547 Builder.restoreIP(NewOuter);
1548 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1549 AllocaInst *ZeroAddrAlloca =
1550 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1551 Instruction *TIDAddr = TIDAddrAlloca;
1552 Instruction *ZeroAddr = ZeroAddrAlloca;
1553 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1554 // Add additional casts to enforce pointers in zero address space
1555 TIDAddr = new AddrSpaceCastInst(
1556        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1557 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1558 ToBeDeleted.push_back(TIDAddr);
1559 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1560                                     PointerType::get(M.getContext(), 0),
1561 "zero.addr.ascast");
1562 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1563 ToBeDeleted.push_back(ZeroAddr);
1564 }
1565
1566 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1567 // associated arguments in the outlined function, so we delete them later.
1568 ToBeDeleted.push_back(TIDAddrAlloca);
1569 ToBeDeleted.push_back(ZeroAddrAlloca);
1570
1571 // Create an artificial insertion point that will also ensure the blocks we
1572  // are about to split do not become degenerate.
1573 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1574
1575 BasicBlock *EntryBB = UI->getParent();
1576 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1577 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1578 BasicBlock *PRegPreFiniBB =
1579 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1580 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1581
1582 auto FiniCBWrapper = [&](InsertPointTy IP) {
1583 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1584 // target to the region exit block.
1585 if (IP.getBlock()->end() == IP.getPoint()) {
1586 IRBuilder<>::InsertPointGuard IPG(Builder);
1587 Builder.restoreIP(IP);
1588 Instruction *I = Builder.CreateBr(PRegExitBB);
1589 IP = InsertPointTy(I->getParent(), I->getIterator());
1590 }
1591    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1592           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1593 "Unexpected insertion point for finalization call!");
1594 return FiniCB(IP);
1595 };
1596
1597 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1598
1599 // Generate the privatization allocas in the block that will become the entry
1600 // of the outlined function.
1601 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1602 InsertPointTy InnerAllocaIP = Builder.saveIP();
1603
1604 AllocaInst *PrivTIDAddr =
1605 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1606 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1607
1608 // Add some fake uses for OpenMP provided arguments.
1609 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1610 Instruction *ZeroAddrUse =
1611 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1612 ToBeDeleted.push_back(ZeroAddrUse);
1613
1614 // EntryBB
1615 // |
1616 // V
1617 // PRegionEntryBB <- Privatization allocas are placed here.
1618 // |
1619 // V
1620  //     PRegionBodyBB       <- BodyGen is invoked here.
1621 // |
1622 // V
1623 // PRegPreFiniBB <- The block we will start finalization from.
1624 // |
1625 // V
1626 // PRegionExitBB <- A common exit to simplify block collection.
1627 //
1628
1629 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1630
1631 // Let the caller create the body.
1632 assert(BodyGenCB && "Expected body generation callback!");
1633 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1634 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1635 return Err;
1636
1637 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1638
1639 OutlineInfo OI;
1640 if (Config.isTargetDevice()) {
1641 // Generate OpenMP target specific runtime call
1642 OI.PostOutlineCB = [=, ToBeDeletedVec =
1643 std::move(ToBeDeleted)](Function &OutlinedFn) {
1644 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1645 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1646 ThreadID, ToBeDeletedVec);
1647 };
1648 } else {
1649 // Generate OpenMP host runtime call
1650 OI.PostOutlineCB = [=, ToBeDeletedVec =
1651 std::move(ToBeDeleted)](Function &OutlinedFn) {
1652 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1653 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1654 };
1655 }
1656
1657 OI.OuterAllocaBB = OuterAllocaBlock;
1658 OI.EntryBB = PRegEntryBB;
1659 OI.ExitBB = PRegExitBB;
1660
1661 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1662  SmallVector<BasicBlock *, 32> Blocks;
1663  OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1664
1665 CodeExtractorAnalysisCache CEAC(*OuterFn);
1666 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1667 /* AggregateArgs */ false,
1668 /* BlockFrequencyInfo */ nullptr,
1669 /* BranchProbabilityInfo */ nullptr,
1670 /* AssumptionCache */ nullptr,
1671 /* AllowVarArgs */ true,
1672 /* AllowAlloca */ true,
1673 /* AllocationBlock */ OuterAllocaBlock,
1674 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1675
1676 // Find inputs to, outputs from the code region.
1677 BasicBlock *CommonExit = nullptr;
1678 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1679 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1680
1681 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1682 /*CollectGlobalInputs=*/true);
1683
1684 Inputs.remove_if([&](Value *I) {
1685    if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1686      return GV->getValueType() == OpenMPIRBuilder::Ident;
1687
1688 return false;
1689 });
1690
1691 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1692
1693 FunctionCallee TIDRTLFn =
1694 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1695
1696 auto PrivHelper = [&](Value &V) -> Error {
1697 if (&V == TIDAddr || &V == ZeroAddr) {
1698 OI.ExcludeArgsFromAggregate.push_back(&V);
1699 return Error::success();
1700 }
1701
1702    SetVector<Use *> Uses;
1703    for (Use &U : V.uses())
1704 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1705 if (ParallelRegionBlockSet.count(UserI->getParent()))
1706 Uses.insert(&U);
1707
1708 // __kmpc_fork_call expects extra arguments as pointers. If the input
1709 // already has a pointer type, everything is fine. Otherwise, store the
1710 // value onto stack and load it back inside the to-be-outlined region. This
1711 // will ensure only the pointer will be passed to the function.
1712 // FIXME: if there are more than 15 trailing arguments, they must be
1713 // additionally packed in a struct.
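    // For a scalar i32 input %x, the net effect is a sketch like (the value
    // names are illustrative):
    //   %x.reloaded = alloca i32              ; at the outer alloca IP
    //   store i32 %x, ptr %x.reloaded         ; before entering the region
    //   %inner = load i32, ptr %x.reloaded    ; at the inner alloca IP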
1714 Value *Inner = &V;
1715 if (!V.getType()->isPointerTy()) {
1716 IRBuilder<>::InsertPointGuard Guard(Builder);
1717 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1718
1719 Builder.restoreIP(OuterAllocaIP);
1720 Value *Ptr =
1721 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1722
1723 // Store to stack at end of the block that currently branches to the entry
1724 // block of the to-be-outlined region.
1725 Builder.SetInsertPoint(InsertBB,
1726 InsertBB->getTerminator()->getIterator());
1727 Builder.CreateStore(&V, Ptr);
1728
1729 // Load back next to allocations in the to-be-outlined region.
1730 Builder.restoreIP(InnerAllocaIP);
1731 Inner = Builder.CreateLoad(V.getType(), Ptr);
1732 }
1733
1734 Value *ReplacementValue = nullptr;
1735 CallInst *CI = dyn_cast<CallInst>(&V);
1736 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1737 ReplacementValue = PrivTID;
1738 } else {
1739 InsertPointOrErrorTy AfterIP =
1740 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1741 if (!AfterIP)
1742 return AfterIP.takeError();
1743 Builder.restoreIP(*AfterIP);
1744 InnerAllocaIP = {
1745 InnerAllocaIP.getBlock(),
1746 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1747
1748 assert(ReplacementValue &&
1749 "Expected copy/create callback to set replacement value!");
1750 if (ReplacementValue == &V)
1751 return Error::success();
1752 }
1753
1754 for (Use *UPtr : Uses)
1755 UPtr->set(ReplacementValue);
1756
1757 return Error::success();
1758 };
1759
1760 // Reset the inner alloca insertion as it will be used for loading the values
1761 // wrapped into pointers before passing them into the to-be-outlined region.
1762  // Configure it to insert immediately after the fake use of the zero address
1763  // so that the reloaded values are available in the generated body and the
1764  // OpenMP-related values (thread ID and zero address pointers) remain leading
1765  // in the argument list.
1766 InnerAllocaIP = IRBuilder<>::InsertPoint(
1767 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1768
1769 // Reset the outer alloca insertion point to the entry of the relevant block
1770 // in case it was invalidated.
1771 OuterAllocaIP = IRBuilder<>::InsertPoint(
1772 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1773
1774 for (Value *Input : Inputs) {
1775 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1776 if (Error Err = PrivHelper(*Input))
1777 return Err;
1778 }
1779 LLVM_DEBUG({
1780 for (Value *Output : Outputs)
1781      dbgs() << "Captured output: " << *Output << "\n";
1782 });
1783 assert(Outputs.empty() &&
1784 "OpenMP outlining should not produce live-out values!");
1785
1786 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1787 LLVM_DEBUG({
1788 for (auto *BB : Blocks)
1789 dbgs() << " PBR: " << BB->getName() << "\n";
1790 });
1791
1792  // Adjust the finalization stack, verify the adjustment, and call the
1793  // finalize function one last time to finalize values between the pre-fini
1794  // block and the exit block if we left the parallel region "the normal way".
1795 auto FiniInfo = FinalizationStack.pop_back_val();
1796 (void)FiniInfo;
1797 assert(FiniInfo.DK == OMPD_parallel &&
1798 "Unexpected finalization stack state!");
1799
1800 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1801
1802 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1803 if (Error Err = FiniCB(PreFiniIP))
1804 return Err;
1805
1806 // Register the outlined info.
1807 addOutlineInfo(std::move(OI));
1808
1809 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1810 UI->eraseFromParent();
1811
1812 return AfterIP;
1813}
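// A minimal caller-side sketch for createParallel; the callbacks and the
// surrounding OpenMPIRBuilder/IRBuilder setup are assumptions for
// illustration, not code from this file:
//
//   auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
//     Builder.restoreIP(CodeGenIP);
//     // ... emit the parallel region body ...
//     return Error::success();
//   };
//   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
//                     Value &Orig, Value &Inner,
//                     Value *&ReplVal) -> InsertPointOrErrorTy {
//     ReplVal = &Inner; // Share every captured value by default.
//     return CodeGenIP;
//   };
//   auto FiniCB = [&](InsertPointTy IP) { return Error::success(); };
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
//       Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
//       /*NumThreads=*/nullptr, OMP_PROC_BIND_default, /*IsCancellable=*/false);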
1814
1815void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1816 // Build call void __kmpc_flush(ident_t *loc)
1817 uint32_t SrcLocStrSize;
1818 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1819 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1820
1821 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1822}
1823
1824void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1825 if (!updateToLocation(Loc))
1826 return;
1827 emitFlush(Loc);
1828}
1829
1830void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1831 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1832 // global_tid);
1833 uint32_t SrcLocStrSize;
1834 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1835 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1836 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1837
1838 // Ignore return result until untied tasks are supported.
1839 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1840 Args);
1841}
1842
1843void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1844 if (!updateToLocation(Loc))
1845 return;
1846 emitTaskwaitImpl(Loc);
1847}
1848
1849void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1850 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1851 uint32_t SrcLocStrSize;
1852 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1853 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1854  Constant *I32Null = ConstantInt::getNullValue(Int32);
1855  Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1856
1857 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1858 Args);
1859}
1860
1861void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1862 if (!updateToLocation(Loc))
1863 return;
1864 emitTaskyieldImpl(Loc);
1865}
1866
1867// Processes the dependencies in Dependencies and does the following
1868// - Allocates space on the stack for an array of DependInfo objects
1869// - Populates each DependInfo object with the relevant information for
1870//   the corresponding dependence.
1871// - All code is inserted in the entry block of the current function.
1872static Value *emitTaskDependencies(
1873    OpenMPIRBuilder &OMPBuilder,
1874    const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1875  // Early return if we have no dependencies to process
1876 if (Dependencies.empty())
1877 return nullptr;
1878
1879 // Given a vector of DependData objects, in this function we create an
1880 // array on the stack that holds kmp_dep_info objects corresponding
1881 // to each dependency. This is then passed to the OpenMP runtime.
1882  // For example, if there are 'n' dependencies then the following pseudo
1883  // code is generated. Assume the first dependence is on a variable 'a'.
1884 //
1885 // \code{c}
1886  // DepArray = alloc(n x sizeof(kmp_depend_info));
1887 // idx = 0;
1888 // DepArray[idx].base_addr = ptrtoint(&a);
1889 // DepArray[idx].len = 8;
1890  // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1891 // ++idx;
1892 // DepArray[idx].base_addr = ...;
1893 // \endcode
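  // The stores below assume a kmp_depend_info layout mirroring the runtime's
  // definition in kmp.h, roughly:
  //
  // \code{c}
  // struct kmp_depend_info {
  //   intptr_t base_addr;
  //   size_t len;
  //   uint8_t flags;
  // };
  // \endcode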
1894
1895 IRBuilderBase &Builder = OMPBuilder.Builder;
1896 Type *DependInfo = OMPBuilder.DependInfo;
1897 Module &M = OMPBuilder.M;
1898
1899 Value *DepArray = nullptr;
1900 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1901 Builder.SetInsertPoint(
1902 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1903
1904 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1905 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1906
1907 Builder.restoreIP(OldIP);
1908
1909 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1910 Value *Base =
1911 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1912 // Store the pointer to the variable
1913 Value *Addr = Builder.CreateStructGEP(
1914 DependInfo, Base,
1915 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1916 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1917 Builder.CreateStore(DepValPtr, Addr);
1918 // Store the size of the variable
1919 Value *Size = Builder.CreateStructGEP(
1920 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1921 Builder.CreateStore(
1922 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1923 Size);
1924 // Store the dependency kind
1925 Value *Flags = Builder.CreateStructGEP(
1926 DependInfo, Base,
1927 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1928 Builder.CreateStore(
1929 ConstantInt::get(Builder.getInt8Ty(),
1930 static_cast<unsigned int>(Dep.DepKind)),
1931 Flags);
1932 }
1933 return DepArray;
1934}
1935
1936OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1937 const LocationDescription &Loc, InsertPointTy AllocaIP,
1938 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1939 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1940 Value *Priority) {
1941
1942 if (!updateToLocation(Loc))
1943 return InsertPointTy();
1944
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 // The current basic block is split into four basic blocks. After outlining,
1949 // they will be mapped as follows:
1950 // ```
1951 // def current_fn() {
1952 // current_basic_block:
1953 // br label %task.exit
1954 // task.exit:
1955 // ; instructions after task
1956 // }
1957 // def outlined_fn() {
1958 // task.alloca:
1959 // br label %task.body
1960 // task.body:
1961 // ret void
1962 // }
1963 // ```
1964 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1965 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1966 BasicBlock *TaskAllocaBB =
1967 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1968
1969 InsertPointTy TaskAllocaIP =
1970 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1971 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1972 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1973 return Err;
1974
1975 OutlineInfo OI;
1976 OI.EntryBB = TaskAllocaBB;
1977 OI.OuterAllocaBB = AllocaIP.getBlock();
1978 OI.ExitBB = TaskExitBB;
1979
1980 // Add the thread ID argument.
1981  SmallVector<Instruction *, 4> ToBeDeleted;
1982  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1983 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1984
1985 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1986 Mergeable, Priority, EventHandle, TaskAllocaBB,
1987 ToBeDeleted](Function &OutlinedFn) mutable {
1988    // Replace the stale call instruction (StaleCI) with the appropriate RTL
1988    // function call.
1989 assert(OutlinedFn.hasOneUse() &&
1990 "there must be a single user for the outlined function");
1991 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1992
1993 // HasShareds is true if any variables are captured in the outlined region,
1994 // false otherwise.
1995 bool HasShareds = StaleCI->arg_size() > 1;
1996 Builder.SetInsertPoint(StaleCI);
1997
1998 // Gather the arguments for emitting the runtime call for
1999 // @__kmpc_omp_task_alloc
2000 Function *TaskAllocFn =
2001 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2002
2003    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2004    // call.
2005 Value *ThreadID = getOrCreateThreadID(Ident);
2006
2007 // Argument - `flags`
2008 // Task is tied iff (Flags & 1) == 1.
2009 // Task is untied iff (Flags & 1) == 0.
2010 // Task is final iff (Flags & 2) == 2.
2011 // Task is not final iff (Flags & 2) == 0.
2012 // Task is mergeable iff (Flags & 4) == 4.
2013 // Task is not mergeable iff (Flags & 4) == 0.
2014 // Task is priority iff (Flags & 32) == 32.
2015 // Task is not priority iff (Flags & 32) == 0.
2016 // TODO: Handle the other flags.
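    // For example, a tied (1), mergeable (4) task with a priority clause (32)
    // ends up with Flags == 1 | 4 | 32 == 37.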
2017 Value *Flags = Builder.getInt32(Tied);
2018 if (Final) {
2019 Value *FinalFlag =
2020 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2021 Flags = Builder.CreateOr(FinalFlag, Flags);
2022 }
2023
2024 if (Mergeable)
2025 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2026 if (Priority)
2027 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2028
2029 // Argument - `sizeof_kmp_task_t` (TaskSize)
2030    // TaskSize refers to the size in bytes of the kmp_task_t data structure
2031    // including private vars accessed in the task.
2032 // TODO: add kmp_task_t_with_privates (privates)
2033 Value *TaskSize = Builder.getInt64(
2034 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2035
2036 // Argument - `sizeof_shareds` (SharedsSize)
2037 // SharedsSize refers to the shareds array size in the kmp_task_t data
2038 // structure.
2039 Value *SharedsSize = Builder.getInt64(0);
2040 if (HasShareds) {
2041 AllocaInst *ArgStructAlloca =
2042          dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2043      assert(ArgStructAlloca &&
2044 "Unable to find the alloca instruction corresponding to arguments "
2045 "for extracted function");
2046 StructType *ArgStructType =
2047 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2048 assert(ArgStructType && "Unable to find struct type corresponding to "
2049 "arguments for extracted function");
2050 SharedsSize =
2051 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2052 }
2053 // Emit the @__kmpc_omp_task_alloc runtime call
2054 // The runtime call returns a pointer to an area where the task captured
2055 // variables must be copied before the task is run (TaskData)
2056 CallInst *TaskData = Builder.CreateCall(
2057 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2058 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2059 /*task_func=*/&OutlinedFn});
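    // Sketch of the resulting allocation for a tied task with 16 bytes of
    // shareds (sizes and names are illustrative):
    //
    //   %task = call ptr @__kmpc_omp_task_alloc(ptr @ident, i32 %gtid, i32 1,
    //                                           i64 40, i64 16, ptr @outlined_fn)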
2060
2061 // Emit detach clause initialization.
2062 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2063 // task_descriptor);
2064 if (EventHandle) {
2065 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2066 OMPRTL___kmpc_task_allow_completion_event);
2067 llvm::Value *EventVal =
2068 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2069 llvm::Value *EventHandleAddr =
2070 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2071 Builder.getPtrTy(0));
2072 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2073 Builder.CreateStore(EventVal, EventHandleAddr);
2074 }
2075 // Copy the arguments for outlined function
2076 if (HasShareds) {
2077 Value *Shareds = StaleCI->getArgOperand(1);
2078 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2079 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2080 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2081 SharedsSize);
2082 }
2083
2084 if (Priority) {
2085 //
2086 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2087 // we populate the priority information into the "kmp_task_t" here
2088 //
2089 // The struct "kmp_task_t" definition is available in kmp.h
2090 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2091 // data2 is used for priority
2092 //
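      // Accordingly, the GEPs below address field index 4 (data2) of the task
      // descriptor and store the priority into the first pointer-sized slot of
      // that kmp_cmplrdata_t union.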
2093 Type *Int32Ty = Builder.getInt32Ty();
2094 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2095 // kmp_task_t* => { ptr }
2096 Type *TaskPtr = StructType::get(VoidPtr);
2097 Value *TaskGEP =
2098 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2099 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2100 Type *TaskStructType = StructType::get(
2101 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2102 Value *PriorityData = Builder.CreateInBoundsGEP(
2103 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2104 // kmp_cmplrdata_t => { ptr, ptr }
2105 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2106 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2107 PriorityData, {Zero, Zero});
2108 Builder.CreateStore(Priority, CmplrData);
2109 }
2110
2111 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2112
2113 // In the presence of the `if` clause, the following IR is generated:
2114 // ...
2115 // %data = call @__kmpc_omp_task_alloc(...)
2116 // br i1 %if_condition, label %then, label %else
2117 // then:
2118 // call @__kmpc_omp_task(...)
2119 // br label %exit
2120 // else:
2121 // ;; Wait for resolution of dependencies, if any, before
2122 // ;; beginning the task
2123 // call @__kmpc_omp_wait_deps(...)
2124 // call @__kmpc_omp_task_begin_if0(...)
2125 // call @outlined_fn(...)
2126 // call @__kmpc_omp_task_complete_if0(...)
2127 // br label %exit
2128 // exit:
2129 // ...
2130 if (IfCondition) {
2131 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2132 // terminator.
2133 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2134 Instruction *IfTerminator =
2135 Builder.GetInsertPoint()->getParent()->getTerminator();
2136 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2137 Builder.SetInsertPoint(IfTerminator);
2138 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2139 &ElseTI);
2140 Builder.SetInsertPoint(ElseTI);
2141
2142 if (Dependencies.size()) {
2143 Function *TaskWaitFn =
2144 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2145 Builder.CreateCall(
2146 TaskWaitFn,
2147 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2148 ConstantInt::get(Builder.getInt32Ty(), 0),
2149             ConstantPointerNull::get(Builder.getPtrTy())});
2150      }
2151 Function *TaskBeginFn =
2152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2153 Function *TaskCompleteFn =
2154 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2155 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2156 CallInst *CI = nullptr;
2157 if (HasShareds)
2158 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2159 else
2160 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2161 CI->setDebugLoc(StaleCI->getDebugLoc());
2162 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2163 Builder.SetInsertPoint(ThenTI);
2164 }
2165
2166 if (Dependencies.size()) {
2167 Function *TaskFn =
2168 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2169 Builder.CreateCall(
2170 TaskFn,
2171 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2172 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2173           ConstantPointerNull::get(Builder.getPtrTy())});
2174
2175 } else {
2176 // Emit the @__kmpc_omp_task runtime call to spawn the task
2177 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2178 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2179 }
2180
2181 StaleCI->eraseFromParent();
2182
2183 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2184 if (HasShareds) {
2185 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2186 OutlinedFn.getArg(1)->replaceUsesWithIf(
2187 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2188 }
2189
2190 for (Instruction *I : llvm::reverse(ToBeDeleted))
2191 I->eraseFromParent();
2192 };
2193
2194 addOutlineInfo(std::move(OI));
2195 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2196
2197 return Builder.saveIP();
2198}
2199
2200OpenMPIRBuilder::InsertPointOrErrorTy
2201OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2202 InsertPointTy AllocaIP,
2203 BodyGenCallbackTy BodyGenCB) {
2204 if (!updateToLocation(Loc))
2205 return InsertPointTy();
2206
2207 uint32_t SrcLocStrSize;
2208 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2209 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2210 Value *ThreadID = getOrCreateThreadID(Ident);
2211
2212 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2213 Function *TaskgroupFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2215 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2216
2217 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2218 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2219 return Err;
2220
2221 Builder.SetInsertPoint(TaskgroupExitBB);
2222 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2223 Function *EndTaskgroupFn =
2224 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2225 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2226
2227 return Builder.saveIP();
2228}
2229
2230OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2231 const LocationDescription &Loc, InsertPointTy AllocaIP,
2232 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2233 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2234 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2235
2236 if (!updateToLocation(Loc))
2237 return Loc.IP;
2238
2239  // FiniCBWrapper needs to create a branch to the loop finalization block, but
2240  // that block may not have been created yet when this callback runs.
2241 SmallVector<BranchInst *> CancellationBranches;
2242 auto FiniCBWrapper = [&](InsertPointTy IP) {
2243 if (IP.getBlock()->end() != IP.getPoint())
2244 return FiniCB(IP);
2245    // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2246    // will fail because that function requires the Finalization Basic Block to
2247    // have a terminator, which is already removed by EmitOMPRegionBody.
2248    // IP currently points at the cancellation block.
2249 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2250 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2251 CancellationBranches.push_back(DummyBranch);
2252 return FiniCB(IP);
2253 };
2254
2255 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2256
2257 // Each section is emitted as a switch case
2258 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2259 // -> OMP.createSection() which generates the IR for each section
2260 // Iterate through all sections and emit a switch construct:
2261 // switch (IV) {
2262 // case 0:
2263 // <SectionStmt[0]>;
2264 // break;
2265 // ...
2266 // case <NumSection> - 1:
2267 // <SectionStmt[<NumSection> - 1]>;
2268 // break;
2269 // }
2270 // ...
2271 // section_loop.after:
2272 // <FiniCB>;
2273 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2274 Builder.restoreIP(CodeGenIP);
2275    BasicBlock *Continue =
2276        splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2277 Function *CurFn = Continue->getParent();
2278 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2279
2280 unsigned CaseNumber = 0;
2281 for (auto SectionCB : SectionCBs) {
2282      BasicBlock *CaseBB = BasicBlock::Create(
2283          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2284 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2285 Builder.SetInsertPoint(CaseBB);
2286 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2287 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2288 CaseEndBr->getIterator()}))
2289 return Err;
2290 CaseNumber++;
2291 }
2292 // remove the existing terminator from body BB since there can be no
2293 // terminators after switch/case
2294 return Error::success();
2295 };
2296 // Loop body ends here
2297  // LowerBound, UpperBound, and Stride for createCanonicalLoop
2298 Type *I32Ty = Type::getInt32Ty(M.getContext());
2299 Value *LB = ConstantInt::get(I32Ty, 0);
2300 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2301 Value *ST = ConstantInt::get(I32Ty, 1);
2302 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2303 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2304 if (!LoopInfo)
2305 return LoopInfo.takeError();
2306
2307 InsertPointOrErrorTy WsloopIP =
2308 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2309 WorksharingLoopType::ForStaticLoop, !IsNowait);
2310 if (!WsloopIP)
2311 return WsloopIP.takeError();
2312 InsertPointTy AfterIP = *WsloopIP;
2313
2314 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2315 assert(LoopFini && "Bad structure of static workshare loop finalization");
2316
2317 // Apply the finalization callback in LoopAfterBB
2318 auto FiniInfo = FinalizationStack.pop_back_val();
2319 assert(FiniInfo.DK == OMPD_sections &&
2320 "Unexpected finalization stack state!");
2321 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2322 Builder.restoreIP(AfterIP);
2323 BasicBlock *FiniBB =
2324 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2325 if (Error Err = CB(Builder.saveIP()))
2326 return Err;
2327 AfterIP = {FiniBB, FiniBB->begin()};
2328 }
2329
2330 // Now we can fix the dummy branch to point to the right place
2331 for (BranchInst *DummyBranch : CancellationBranches) {
2332 assert(DummyBranch->getNumSuccessors() == 1);
2333 DummyBranch->setSuccessor(0, LoopFini);
2334 }
2335
2336 return AfterIP;
2337}
2338
2339OpenMPIRBuilder::InsertPointOrErrorTy
2340OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2341 BodyGenCallbackTy BodyGenCB,
2342 FinalizeCallbackTy FiniCB) {
2343 if (!updateToLocation(Loc))
2344 return Loc.IP;
2345
2346 auto FiniCBWrapper = [&](InsertPointTy IP) {
2347 if (IP.getBlock()->end() != IP.getPoint())
2348 return FiniCB(IP);
2349    // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2350    // will fail because that function requires the Finalization Basic Block to
2351    // have a terminator, which is already removed by EmitOMPRegionBody.
2352    // IP currently points at the cancellation block.
2353    // We need to backtrack to the condition block to fetch
2354    // the exit block and create a branch from the cancellation
2355    // block to the exit block.
2356 IRBuilder<>::InsertPointGuard IPG(Builder);
2357 Builder.restoreIP(IP);
2358 auto *CaseBB = Loc.IP.getBlock();
2359 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2360 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2361 Instruction *I = Builder.CreateBr(ExitBB);
2362 IP = InsertPointTy(I->getParent(), I->getIterator());
2363 return FiniCB(IP);
2364 };
2365
2366 Directive OMPD = Directive::OMPD_sections;
2367 // Since we are using Finalization Callback here, HasFinalize
2368 // and IsCancellable have to be true
2369 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2370 /*Conditional*/ false, /*hasFinalize*/ true,
2371 /*IsCancellable*/ true);
2372}
2373
2374static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2375  BasicBlock::iterator IT(I);
2376  IT++;
2377 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2378}
2379
2380Value *OpenMPIRBuilder::getGPUThreadID() {
2381 return Builder.CreateCall(
2382 getOrCreateRuntimeFunction(M,
2383 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2384 {});
2385}
2386
2387Value *OpenMPIRBuilder::getGPUWarpSize() {
2388 return Builder.CreateCall(
2389 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2390}
2391
2392Value *OpenMPIRBuilder::getNVPTXWarpID() {
2393 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2394 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2395}
2396
2397Value *OpenMPIRBuilder::getNVPTXLaneID() {
2398 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2399 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2400 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
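  // E.g. with a warp size of 32, LaneIDBits == 5 and LaneIDMask == 0x1f.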
2401 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2402 "nvptx_lane_id");
2403}
2404
2405Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2406 Type *ToType) {
2407 Type *FromType = From->getType();
2408 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2409 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2410 assert(FromSize > 0 && "From size must be greater than zero");
2411 assert(ToSize > 0 && "To size must be greater than zero");
2412 if (FromType == ToType)
2413 return From;
2414 if (FromSize == ToSize)
2415 return Builder.CreateBitCast(From, ToType);
2416 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2417 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2418 InsertPointTy SaveIP = Builder.saveIP();
2419 Builder.restoreIP(AllocaIP);
2420 Value *CastItem = Builder.CreateAlloca(ToType);
2421 Builder.restoreIP(SaveIP);
2422
2423 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2424 CastItem, Builder.getPtrTy(0));
2425 Builder.CreateStore(From, ValCastItem);
2426 return Builder.CreateLoad(ToType, CastItem);
2427}
2428
2429Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2430 Value *Element,
2431 Type *ElementType,
2432 Value *Offset) {
2433 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2434 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2435
2436 // Cast all types to 32- or 64-bit values before calling shuffle routines.
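  // The runtime shuffle helpers move a 32- or 64-bit value in from the lane at
  // (lane_id + Offset) within the warp, conceptually similar to a warp-level
  // shuffle-down (e.g. CUDA's __shfl_down_sync); smaller types are widened
  // first via castValueToType.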
2437 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2438 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2439 Value *WarpSize =
2440 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2441 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2442 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2443 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2444 Value *WarpSizeCast =
2445 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2446 Value *ShuffleCall =
2447 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2448 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2449}
2450
2451void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2452 Value *DstAddr, Type *ElemType,
2453 Value *Offset, Type *ReductionArrayTy) {
2454 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2455 // Create the loop over the big sized data.
2456 // ptr = (void*)Elem;
2457 // ptrEnd = (void*) Elem + 1;
2458 // Step = 8;
2459 // while (ptr + Step < ptrEnd)
2460 // shuffle((int64_t)*ptr);
2461 // Step = 4;
2462 // while (ptr + Step < ptrEnd)
2463 // shuffle((int32_t)*ptr);
2464 // ...
2465 Type *IndexTy = Builder.getIndexTy(
2466 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2467 Value *ElemPtr = DstAddr;
2468 Value *Ptr = SrcAddr;
2469 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2470 if (Size < IntSize)
2471 continue;
2472 Type *IntType = Builder.getIntNTy(IntSize * 8);
2473 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2474 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2475 Value *SrcAddrGEP =
2476 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2477 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2478 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2479
2480 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2481 if ((Size / IntSize) > 1) {
2482 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2483 SrcAddrGEP, Builder.getPtrTy());
2484 BasicBlock *PreCondBB =
2485 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2486 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2487 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2488 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2489 emitBlock(PreCondBB, CurFunc);
2490 PHINode *PhiSrc =
2491 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2492 PhiSrc->addIncoming(Ptr, CurrentBB);
2493 PHINode *PhiDest =
2494 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2495 PhiDest->addIncoming(ElemPtr, CurrentBB);
2496 Ptr = PhiSrc;
2497 ElemPtr = PhiDest;
2498 Value *PtrDiff = Builder.CreatePtrDiff(
2499 Builder.getInt8Ty(), PtrEnd,
2500 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2501 Builder.CreateCondBr(
2502 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2503 ExitBB);
2504 emitBlock(ThenBB, CurFunc);
2505 Value *Res = createRuntimeShuffleFunction(
2506 AllocaIP,
2507 Builder.CreateAlignedLoad(
2508 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2509 IntType, Offset);
2510 Builder.CreateAlignedStore(Res, ElemPtr,
2511 M.getDataLayout().getPrefTypeAlign(ElemType));
2512 Value *LocalPtr =
2513 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2514 Value *LocalElemPtr =
2515 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2516 PhiSrc->addIncoming(LocalPtr, ThenBB);
2517 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2518 emitBranch(PreCondBB);
2519 emitBlock(ExitBB, CurFunc);
2520 } else {
2521 Value *Res = createRuntimeShuffleFunction(
2522 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2523 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2524 Res->getType()->getScalarSizeInBits())
2525 Res = Builder.CreateTrunc(Res, ElemType);
2526 Builder.CreateStore(Res, ElemPtr);
2527 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2528 ElemPtr =
2529 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2530 }
2531 Size = Size % IntSize;
2532 }
2533}
2534
2535void OpenMPIRBuilder::emitReductionListCopy(
2536 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2537 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2538 CopyOptionsTy CopyOptions) {
2539 Type *IndexTy = Builder.getIndexTy(
2540 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2541 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2542
2543 // Iterates, element-by-element, through the source Reduce list and
2544  // makes a copy.
2545 for (auto En : enumerate(ReductionInfos)) {
2546 const ReductionInfo &RI = En.value();
2547 Value *SrcElementAddr = nullptr;
2548 Value *DestElementAddr = nullptr;
2549 Value *DestElementPtrAddr = nullptr;
2550 // Should we shuffle in an element from a remote lane?
2551 bool ShuffleInElement = false;
2552 // Set to true to update the pointer in the dest Reduce list to a
2553 // newly created element.
2554 bool UpdateDestListPtr = false;
2555
2556 // Step 1.1: Get the address for the src element in the Reduce list.
2557 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2558 ReductionArrayTy, SrcBase,
2559 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2560 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2561
2562 // Step 1.2: Create a temporary to store the element in the destination
2563 // Reduce list.
2564 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2565 ReductionArrayTy, DestBase,
2566 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2567 switch (Action) {
2568 case CopyAction::RemoteLaneToThread: {
2569 InsertPointTy CurIP = Builder.saveIP();
2570 Builder.restoreIP(AllocaIP);
2571 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2572 ".omp.reduction.element");
2573 DestAlloca->setAlignment(
2574 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2575 DestElementAddr = DestAlloca;
2576 DestElementAddr =
2577 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2578 DestElementAddr->getName() + ".ascast");
2579 Builder.restoreIP(CurIP);
2580 ShuffleInElement = true;
2581 UpdateDestListPtr = true;
2582 break;
2583 }
2584 case CopyAction::ThreadCopy: {
2585 DestElementAddr =
2586 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2587 break;
2588 }
2589 }
2590
2591 // Now that all active lanes have read the element in the
2592 // Reduce list, shuffle over the value from the remote lane.
2593 if (ShuffleInElement) {
2594 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2595 RemoteLaneOffset, ReductionArrayTy);
2596 } else {
2597 switch (RI.EvaluationKind) {
2598 case EvalKind::Scalar: {
2599 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2600 // Store the source element value to the dest element address.
2601 Builder.CreateStore(Elem, DestElementAddr);
2602 break;
2603 }
2604 case EvalKind::Complex: {
2605 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2606 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2607 Value *SrcReal = Builder.CreateLoad(
2608 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2609 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2610 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2611 Value *SrcImg = Builder.CreateLoad(
2612 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2613
2614 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2615 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2616 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2617 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2618 Builder.CreateStore(SrcReal, DestRealPtr);
2619 Builder.CreateStore(SrcImg, DestImgPtr);
2620 break;
2621 }
2622 case EvalKind::Aggregate: {
2623 Value *SizeVal = Builder.getInt64(
2624 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2625 Builder.CreateMemCpy(
2626 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2627 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2628 SizeVal, false);
2629 break;
2630 }
2631 };
2632 }
2633
2634 // Step 3.1: Modify reference in dest Reduce list as needed.
2635 // Modifying the reference in Reduce list to point to the newly
2636 // created element. The element is live in the current function
2637 // scope and that of functions it invokes (i.e., reduce_function).
2638 // RemoteReduceData[i] = (void*)&RemoteElem
2639 if (UpdateDestListPtr) {
2640 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2641 DestElementAddr, Builder.getPtrTy(),
2642 DestElementAddr->getName() + ".ascast");
2643 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2644 }
2645 }
2646}
2647
2648Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2649 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2650 AttributeList FuncAttrs) {
2651 InsertPointTy SavedIP = Builder.saveIP();
2652 LLVMContext &Ctx = M.getContext();
2653  FunctionType *FuncTy = FunctionType::get(
2654      Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2655 /* IsVarArg */ false);
2656 Function *WcFunc =
2658 "_omp_reduction_inter_warp_copy_func", &M);
2659 WcFunc->setAttributes(FuncAttrs);
2660 WcFunc->addParamAttr(0, Attribute::NoUndef);
2661 WcFunc->addParamAttr(1, Attribute::NoUndef);
2662 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2663 Builder.SetInsertPoint(EntryBB);
2664
2665 // ReduceList: thread local Reduce list.
2666 // At the stage of the computation when this function is called, partially
2667 // aggregated values reside in the first lane of every active warp.
2668 Argument *ReduceListArg = WcFunc->getArg(0);
2669 // NumWarps: number of warps active in the parallel region. This could
2670 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2671 Argument *NumWarpsArg = WcFunc->getArg(1);
2672
2673 // This array is used as a medium to transfer, one reduce element at a time,
2674 // the data from the first lane of every warp to lanes in the first warp
2675 // in order to perform the final step of a reduction in a parallel region
2676 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2677 // for reduced latency, as well as to have a distinct copy for concurrently
2678  // executing target regions. The array is declared with weak linkage so
2679  // as to be shared across compilation units.
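  // For a warp size of 32 the resulting global is a sketch like:
  //   @__openmp_nvptx_data_transfer_temporary_storage =
  //       weak addrspace(3) global [32 x i32] undef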
2680 StringRef TransferMediumName =
2681 "__openmp_nvptx_data_transfer_temporary_storage";
2682 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2683 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2684 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2685 if (!TransferMedium) {
2686 TransferMedium = new GlobalVariable(
2687 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2688 UndefValue::get(ArrayTy), TransferMediumName,
2689 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2690 /*AddressSpace=*/3);
2691 }
2692
2693 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694 Value *GPUThreadID = getGPUThreadID();
2695 // nvptx_lane_id = nvptx_id % warpsize
2696 Value *LaneID = getNVPTXLaneID();
2697 // nvptx_warp_id = nvptx_id / warpsize
2698 Value *WarpID = getNVPTXWarpID();
2699
2700 InsertPointTy AllocaIP =
2701 InsertPointTy(Builder.GetInsertBlock(),
2702 Builder.GetInsertBlock()->getFirstInsertionPt());
2703 Type *Arg0Type = ReduceListArg->getType();
2704 Type *Arg1Type = NumWarpsArg->getType();
2705 Builder.restoreIP(AllocaIP);
2706 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2707 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2708 AllocaInst *NumWarpsAlloca =
2709 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2710 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2711 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2712 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2713 NumWarpsAlloca, Builder.getPtrTy(0),
2714 NumWarpsAlloca->getName() + ".ascast");
2715 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2716 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2717 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2718 InsertPointTy CodeGenIP =
2719 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2720 Builder.restoreIP(CodeGenIP);
2721
2722 Value *ReduceList =
2723 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2724
2725 for (auto En : enumerate(ReductionInfos)) {
2726 //
2727 // Warp master copies reduce element to transfer medium in __shared__
2728 // memory.
2729 //
2730 const ReductionInfo &RI = En.value();
2731 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2732 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2733 Type *CType = Builder.getIntNTy(TySize * 8);
2734
2735 unsigned NumIters = RealTySize / TySize;
2736 if (NumIters == 0)
2737 continue;
2738 Value *Cnt = nullptr;
2739 Value *CntAddr = nullptr;
2740 BasicBlock *PrecondBB = nullptr;
2741 BasicBlock *ExitBB = nullptr;
2742 if (NumIters > 1) {
2743 CodeGenIP = Builder.saveIP();
2744 Builder.restoreIP(AllocaIP);
2745 CntAddr =
2746 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2747
2748 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2749 CntAddr->getName() + ".ascast");
2750 Builder.restoreIP(CodeGenIP);
2751 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2752 CntAddr,
2753 /*Volatile=*/false);
2754 PrecondBB = BasicBlock::Create(Ctx, "precond");
2755 ExitBB = BasicBlock::Create(Ctx, "exit");
2756 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2757 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2758 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2759 /*Volatile=*/false);
2760 Value *Cmp = Builder.CreateICmpULT(
2761 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2762 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2763 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2764 }
2765
2766 // kmpc_barrier.
2767 InsertPointOrErrorTy BarrierIP1 =
2768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2769 omp::Directive::OMPD_unknown,
2770 /* ForceSimpleCall */ false,
2771 /* CheckCancelFlag */ true);
2772 if (!BarrierIP1)
2773 return BarrierIP1.takeError();
2774 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2775 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2776 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2777
2778 // if (lane_id == 0)
2779 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2780 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2781 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2782
2783 // Reduce element = LocalReduceList[i]
2784 auto *RedListArrayTy =
2785 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2786 Type *IndexTy = Builder.getIndexTy(
2787 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2788 Value *ElemPtrPtr =
2789 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2790 {ConstantInt::get(IndexTy, 0),
2791 ConstantInt::get(IndexTy, En.index())});
2792 // elemptr = ((CopyType*)(elemptrptr)) + I
2793 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2794 if (NumIters > 1)
2795 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2796
2797 // Get pointer to location in transfer medium.
2798 // MediumPtr = &medium[warp_id]
2799 Value *MediumPtr = Builder.CreateInBoundsGEP(
2800 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2801 // elem = *elemptr
2802 //*MediumPtr = elem
2803 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2804 // Store the source element value to the dest element address.
2805 Builder.CreateStore(Elem, MediumPtr,
2806 /*IsVolatile*/ true);
2807 Builder.CreateBr(MergeBB);
2808
2809 // else
2810 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2811 Builder.CreateBr(MergeBB);
2812
2813 // endif
2814 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2815 InsertPointOrErrorTy BarrierIP2 =
2816 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2817 omp::Directive::OMPD_unknown,
2818 /* ForceSimpleCall */ false,
2819 /* CheckCancelFlag */ true);
2820 if (!BarrierIP2)
2821 return BarrierIP2.takeError();
2822
2823 // Warp 0 copies reduce element from transfer medium
2824 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2825 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2826 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2827
2828 Value *NumWarpsVal =
2829 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2830 // Up to 32 threads in warp 0 are active.
2831 Value *IsActiveThread =
2832 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2833 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2834
2835 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2836
2837 // SecMediumPtr = &medium[tid]
2838 // SrcMediumVal = *SrcMediumPtr
2839 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2840 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2841 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2842 Value *TargetElemPtrPtr =
2843 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2844 {ConstantInt::get(IndexTy, 0),
2845 ConstantInt::get(IndexTy, En.index())});
2846 Value *TargetElemPtrVal =
2847 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2848 Value *TargetElemPtr = TargetElemPtrVal;
2849 if (NumIters > 1)
2850 TargetElemPtr =
2851 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2852
2853 // *TargetElemPtr = SrcMediumVal;
2854 Value *SrcMediumValue =
2855 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2856 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2857 Builder.CreateBr(W0MergeBB);
2858
2859 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2860 Builder.CreateBr(W0MergeBB);
2861
2862 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2863
2864 if (NumIters > 1) {
2865 Cnt = Builder.CreateNSWAdd(
2866 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2867 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2868
2869 auto *CurFn = Builder.GetInsertBlock()->getParent();
2870 emitBranch(PrecondBB);
2871 emitBlock(ExitBB, CurFn);
2872 }
2873 RealTySize %= TySize;
2874 }
2875 }
2876
2877 Builder.CreateRetVoid();
2878 Builder.restoreIP(SavedIP);
2879
2880 return WcFunc;
2881}
2882
2883Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2885 AttributeList FuncAttrs) {
2886 LLVMContext &Ctx = M.getContext();
2887 FunctionType *FuncTy =
2888 FunctionType::get(Builder.getVoidTy(),
2889 {Builder.getPtrTy(), Builder.getInt16Ty(),
2890 Builder.getInt16Ty(), Builder.getInt16Ty()},
2891 /* IsVarArg */ false);
2892 Function *SarFunc =
2894 "_omp_reduction_shuffle_and_reduce_func", &M);
2895 SarFunc->setAttributes(FuncAttrs);
2896 SarFunc->addParamAttr(0, Attribute::NoUndef);
2897 SarFunc->addParamAttr(1, Attribute::NoUndef);
2898 SarFunc->addParamAttr(2, Attribute::NoUndef);
2899 SarFunc->addParamAttr(3, Attribute::NoUndef);
2900 SarFunc->addParamAttr(1, Attribute::SExt);
2901 SarFunc->addParamAttr(2, Attribute::SExt);
2902 SarFunc->addParamAttr(3, Attribute::SExt);
2903 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2904 Builder.SetInsertPoint(EntryBB);
2905
2906 // Thread local Reduce list used to host the values of data to be reduced.
2907 Argument *ReduceListArg = SarFunc->getArg(0);
2908 // Current lane id; could be logical.
2909 Argument *LaneIDArg = SarFunc->getArg(1);
2910 // Offset of the remote source lane relative to the current lane.
2911 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2912 // Algorithm version. This is expected to be known at compile time.
2913 Argument *AlgoVerArg = SarFunc->getArg(3);
2914
2915 Type *ReduceListArgType = ReduceListArg->getType();
2916 Type *LaneIDArgType = LaneIDArg->getType();
2917 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2918 Value *ReduceListAlloca = Builder.CreateAlloca(
2919 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2920 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2921 LaneIDArg->getName() + ".addr");
2922 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2923 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2924 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2925 AlgoVerArg->getName() + ".addr");
2926 ArrayType *RedListArrayTy =
2927 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2928
2929 // Create a local thread-private variable to host the Reduce list
2930 // from a remote lane.
2931 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2932 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2933
2934 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2935 ReduceListAlloca, ReduceListArgType,
2936 ReduceListAlloca->getName() + ".ascast");
2937 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2938 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2939 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2940 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2941 RemoteLaneOffsetAlloca->getName() + ".ascast");
2942 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2943 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2944 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2945 RemoteReductionListAlloca, Builder.getPtrTy(),
2946 RemoteReductionListAlloca->getName() + ".ascast");
2947
2948 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2949 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2950 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2951 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2952
2953 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2954 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2955 Value *RemoteLaneOffset =
2956 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2957 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2958
2959 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2960
2961 // This loop iterates through the list of reduce elements and copies,
2962 // element by element, from a remote lane in the warp to RemoteReduceList,
2963 // hosted on the thread's stack.
2964 emitReductionListCopy(
2965 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2966 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2967
2968 // The actions to be performed on the Remote Reduce list are dependent
2969 // on the algorithm version.
2970 //
2971 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2972 // LaneId % 2 == 0 && Offset > 0):
2973 // do the reduction value aggregation
2974 //
2975 // The thread local variable Reduce list is mutated in place to host the
2976 // reduced data, which is the aggregated value produced from local and
2977 // remote lanes.
2978 //
2979 // Note that AlgoVer is expected to be a constant integer known at compile
2980 // time.
2981 // When AlgoVer==0, the first conjunction evaluates to true, making
2982 // the entire predicate true at compile time.
2983 // When AlgoVer==1, only the second part of the second conjunction needs
2984 // to be evaluated at runtime; the other conjunctions evaluate to false
2985 // at compile time.
2986 // When AlgoVer==2, only the second part of the third conjunction needs
2987 // to be evaluated at runtime; the other conjunctions evaluate to false
2988 // at compile time.
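// For illustration (the concrete values are assumptions, not fixed here):
// with AlgoVer==1 and Offset==16 in a 32-lane warp, lanes 0..15 satisfy
// LaneId < Offset and aggregate the value shuffled in from lanes 16..31,
// while lanes 16..31 instead take the copy path guarded further below.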
2989 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2990 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2991 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2992 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2993 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2994 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2995 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2996 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2997 Value *RemoteOffsetComp =
2998 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2999 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3000 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3001 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3002
3003 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3004 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3005 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3006
3007 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3008 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3009 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3010 ReduceList, Builder.getPtrTy());
3011 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3012 RemoteListAddrCast, Builder.getPtrTy());
3013 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3014 ->addFnAttr(Attribute::NoUnwind);
3015 Builder.CreateBr(MergeBB);
3016
3017 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3018 Builder.CreateBr(MergeBB);
3019
3020 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3021
3022 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3023 // Reduce list.
3024 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3025 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3026 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3027
3028 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3029 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3030 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3031 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3032
3033 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3034 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3035 ReductionInfos, RemoteListAddrCast, ReduceList);
3036 Builder.CreateBr(CpyMergeBB);
3037
3038 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3039 Builder.CreateBr(CpyMergeBB);
3040
3041 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3042
3043 Builder.CreateRetVoid();
3044
3045 return SarFunc;
3046}
3047
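// Emits the helper that copies each element of a thread's Reduce list into
// row Idx of the global reductions buffer, dispatching on the evaluation
// kind: a plain store for scalars, per-component stores for complex values,
// and a memcpy for aggregates.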
3048Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3049 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3050 AttributeList FuncAttrs) {
3051 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3052 LLVMContext &Ctx = M.getContext();
3053 auto *FuncTy = FunctionType::get(
3054 Builder.getVoidTy(),
3055 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3056 /* IsVarArg */ false);
3057 Function *LtGCFunc =
3058 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3059 "_omp_reduction_list_to_global_copy_func", &M);
3060 LtGCFunc->setAttributes(FuncAttrs);
3061 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3062 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3063 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3064
3065 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3066 Builder.SetInsertPoint(EntryBlock);
3067
3068 // Buffer: global reduction buffer.
3069 Argument *BufferArg = LtGCFunc->getArg(0);
3070 // Idx: index of the buffer.
3071 Argument *IdxArg = LtGCFunc->getArg(1);
3072 // ReduceList: thread local Reduce list.
3073 Argument *ReduceListArg = LtGCFunc->getArg(2);
3074
3075 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3076 BufferArg->getName() + ".addr");
3077 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3078 IdxArg->getName() + ".addr");
3079 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3080 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3081 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3082 BufferArgAlloca, Builder.getPtrTy(),
3083 BufferArgAlloca->getName() + ".ascast");
3084 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3085 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3086 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3087 ReduceListArgAlloca, Builder.getPtrTy(),
3088 ReduceListArgAlloca->getName() + ".ascast");
3089
3090 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3091 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3092 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3093
3094 Value *LocalReduceList =
3095 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3096 Value *BufferArgVal =
3097 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3098 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3099 Type *IndexTy = Builder.getIndexTy(
3100 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3101 for (auto En : enumerate(ReductionInfos)) {
3102 const ReductionInfo &RI = En.value();
3103 auto *RedListArrayTy =
3104 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3105 // Reduce element = LocalReduceList[i]
3106 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3107 RedListArrayTy, LocalReduceList,
3108 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3109 // elemptr = ((CopyType*)(elemptrptr)) + I
3110 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3111
3112 // Global = Buffer.VD[Idx];
3113 Value *BufferVD =
3114 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3115 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3116 ReductionsBufferTy, BufferVD, 0, En.index());
3117
3118 switch (RI.EvaluationKind) {
3119 case EvalKind::Scalar: {
3120 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3121 Builder.CreateStore(TargetElement, GlobVal);
3122 break;
3123 }
3124 case EvalKind::Complex: {
3125 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3126 RI.ElementType, ElemPtr, 0, 0, ".realp");
3127 Value *SrcReal = Builder.CreateLoad(
3128 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3129 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3130 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3131 Value *SrcImg = Builder.CreateLoad(
3132 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3133
3134 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3135 RI.ElementType, GlobVal, 0, 0, ".realp");
3136 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3137 RI.ElementType, GlobVal, 0, 1, ".imagp");
3138 Builder.CreateStore(SrcReal, DestRealPtr);
3139 Builder.CreateStore(SrcImg, DestImgPtr);
3140 break;
3141 }
3142 case EvalKind::Aggregate: {
3143 Value *SizeVal =
3144 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3145 Builder.CreateMemCpy(
3146 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3147 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3148 break;
3149 }
3150 }
3151 }
3152
3153 Builder.CreateRetVoid();
3154 Builder.restoreIP(OldIP);
3155 return LtGCFunc;
3156}
3157
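// Emits the helper that reduces a thread's Reduce list into the global
// buffer: it builds a local list of pointers into row Idx of the buffer and
// calls the elementwise reduction function as
// reduce_function(GlobalReduceList, ReduceList).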
3158Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3159 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3160 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3161 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3162 LLVMContext &Ctx = M.getContext();
3163 auto *FuncTy = FunctionType::get(
3164 Builder.getVoidTy(),
3165 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3166 /* IsVarArg */ false);
3167 Function *LtGRFunc =
3168 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3169 "_omp_reduction_list_to_global_reduce_func", &M);
3170 LtGRFunc->setAttributes(FuncAttrs);
3171 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3172 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3173 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3174
3175 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3176 Builder.SetInsertPoint(EntryBlock);
3177
3178 // Buffer: global reduction buffer.
3179 Argument *BufferArg = LtGRFunc->getArg(0);
3180 // Idx: index of the buffer.
3181 Argument *IdxArg = LtGRFunc->getArg(1);
3182 // ReduceList: thread local Reduce list.
3183 Argument *ReduceListArg = LtGRFunc->getArg(2);
3184
3185 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3186 BufferArg->getName() + ".addr");
3187 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3188 IdxArg->getName() + ".addr");
3189 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3190 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3191 auto *RedListArrayTy =
3192 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3193
3194 // 1. Build a list of reduction variables.
3195 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3196 Value *LocalReduceList =
3197 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3198
3199 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3200 BufferArgAlloca, Builder.getPtrTy(),
3201 BufferArgAlloca->getName() + ".ascast");
3202 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3203 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3204 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 ReduceListArgAlloca, Builder.getPtrTy(),
3206 ReduceListArgAlloca->getName() + ".ascast");
3207 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3208 LocalReduceList, Builder.getPtrTy(),
3209 LocalReduceList->getName() + ".ascast");
3210
3211 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3212 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3213 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3214
3215 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3216 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3217 Type *IndexTy = Builder.getIndexTy(
3218 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3219 for (auto En : enumerate(ReductionInfos)) {
3220 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3221 RedListArrayTy, LocalReduceListAddrCast,
3222 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3223 Value *BufferVD =
3224 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3225 // Global = Buffer.VD[Idx];
3226 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3227 ReductionsBufferTy, BufferVD, 0, En.index());
3228 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3229 }
3230
3231 // Call reduce_function(GlobalReduceList, ReduceList)
3232 Value *ReduceList =
3233 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3234 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3235 ->addFnAttr(Attribute::NoUnwind);
3236 Builder.CreateRetVoid();
3237 Builder.restoreIP(OldIP);
3238 return LtGRFunc;
3239}
3240
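// Emits the inverse of the list-to-global copy: each value is read from row
// Idx of the global reductions buffer and stored back into the thread's
// Reduce list, again dispatching on scalar, complex, or aggregate kind.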
3241Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3242 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3243 AttributeList FuncAttrs) {
3244 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3245 LLVMContext &Ctx = M.getContext();
3246 auto *FuncTy = FunctionType::get(
3247 Builder.getVoidTy(),
3248 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3249 /* IsVarArg */ false);
3250 Function *LtGCFunc =
3251 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3252 "_omp_reduction_global_to_list_copy_func", &M);
3253 LtGCFunc->setAttributes(FuncAttrs);
3254 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3255 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3256 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3257
3258 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3259 Builder.SetInsertPoint(EntryBlock);
3260
3261 // Buffer: global reduction buffer.
3262 Argument *BufferArg = LtGCFunc->getArg(0);
3263 // Idx: index of the buffer.
3264 Argument *IdxArg = LtGCFunc->getArg(1);
3265 // ReduceList: thread local Reduce list.
3266 Argument *ReduceListArg = LtGCFunc->getArg(2);
3267
3268 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3269 BufferArg->getName() + ".addr");
3270 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3271 IdxArg->getName() + ".addr");
3272 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3273 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3274 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3275 BufferArgAlloca, Builder.getPtrTy(),
3276 BufferArgAlloca->getName() + ".ascast");
3277 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3278 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3279 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3280 ReduceListArgAlloca, Builder.getPtrTy(),
3281 ReduceListArgAlloca->getName() + ".ascast");
3282 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3283 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3284 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3285
3286 Value *LocalReduceList =
3287 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3288 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3289 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3290 Type *IndexTy = Builder.getIndexTy(
3291 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3292 for (auto En : enumerate(ReductionInfos)) {
3293 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3294 auto *RedListArrayTy =
3295 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3296 // Reduce element = LocalReduceList[i]
3297 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3298 RedListArrayTy, LocalReduceList,
3299 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3300 // elemptr = ((CopyType*)(elemptrptr)) + I
3301 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3302 // Global = Buffer.VD[Idx];
3303 Value *BufferVD =
3304 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3305 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3306 ReductionsBufferTy, BufferVD, 0, En.index());
3307
3308 switch (RI.EvaluationKind) {
3309 case EvalKind::Scalar: {
3310 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3311 Builder.CreateStore(TargetElement, ElemPtr);
3312 break;
3313 }
3314 case EvalKind::Complex: {
3315 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3316 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3317 Value *SrcReal = Builder.CreateLoad(
3318 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3319 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3320 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3321 Value *SrcImg = Builder.CreateLoad(
3322 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3323
3324 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3325 RI.ElementType, ElemPtr, 0, 0, ".realp");
3326 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3327 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3328 Builder.CreateStore(SrcReal, DestRealPtr);
3329 Builder.CreateStore(SrcImg, DestImgPtr);
3330 break;
3331 }
3332 case EvalKind::Aggregate: {
3333 Value *SizeVal =
3334 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3335 Builder.CreateMemCpy(
3336 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3337 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3338 SizeVal, false);
3339 break;
3340 }
3341 }
3342 }
3343
3344 Builder.CreateRetVoid();
3345 Builder.restoreIP(OldIP);
3346 return LtGCFunc;
3347}
3348
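// Emits the helper that reduces row Idx of the global buffer into the
// thread's Reduce list; note the argument order is reversed relative to the
// list-to-global variant: reduce_function(ReduceList, GlobalReduceList).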
3349Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3350 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3351 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3352 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3353 LLVMContext &Ctx = M.getContext();
3354 auto *FuncTy = FunctionType::get(
3355 Builder.getVoidTy(),
3356 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3357 /* IsVarArg */ false);
3358 Function *LtGRFunc =
3359 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3360 "_omp_reduction_global_to_list_reduce_func", &M);
3361 LtGRFunc->setAttributes(FuncAttrs);
3362 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3363 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3364 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3365
3366 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3367 Builder.SetInsertPoint(EntryBlock);
3368
3369 // Buffer: global reduction buffer.
3370 Argument *BufferArg = LtGRFunc->getArg(0);
3371 // Idx: index of the buffer.
3372 Argument *IdxArg = LtGRFunc->getArg(1);
3373 // ReduceList: thread local Reduce list.
3374 Argument *ReduceListArg = LtGRFunc->getArg(2);
3375
3376 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3377 BufferArg->getName() + ".addr");
3378 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3379 IdxArg->getName() + ".addr");
3380 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3381 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3382 ArrayType *RedListArrayTy =
3383 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3384
3385 // 1. Build a list of reduction variables.
3386 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3387 Value *LocalReduceList =
3388 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3389
3390 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3391 BufferArgAlloca, Builder.getPtrTy(),
3392 BufferArgAlloca->getName() + ".ascast");
3393 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3394 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3395 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3396 ReduceListArgAlloca, Builder.getPtrTy(),
3397 ReduceListArgAlloca->getName() + ".ascast");
3398 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3399 LocalReduceList, Builder.getPtrTy(),
3400 LocalReduceList->getName() + ".ascast");
3401
3402 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3403 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3404 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3405
3406 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3407 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3408 Type *IndexTy = Builder.getIndexTy(
3409 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3410 for (auto En : enumerate(ReductionInfos)) {
3411 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3412 RedListArrayTy, ReductionList,
3413 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3414 // Global = Buffer.VD[Idx];
3415 Value *BufferVD =
3416 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3417 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3418 ReductionsBufferTy, BufferVD, 0, En.index());
3419 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3420 }
3421
3422 // Call reduce_function(ReduceList, GlobalReduceList)
3423 Value *ReduceList =
3424 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3425 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3426 ->addFnAttr(Attribute::NoUnwind);
3427 Builder.CreateRetVoid();
3428 Builder.restoreIP(OldIP);
3429 return LtGRFunc;
3430}
3431
3432std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3433 std::string Suffix =
3434 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3435 return (Name + Suffix).str();
3436}
3437
3438Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3439 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3440 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3441 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3442 {Builder.getPtrTy(), Builder.getPtrTy()},
3443 /* IsVarArg */ false);
3444 std::string Name = getReductionFuncName(ReducerName);
3445 Function *ReductionFunc =
3446 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3447 ReductionFunc->setAttributes(FuncAttrs);
3448 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3449 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3450 BasicBlock *EntryBB =
3451 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3452 Builder.SetInsertPoint(EntryBB);
3453
3454 // Need to alloca memory here and deal with the pointers before getting
3455 // LHS/RHS pointers out
3456 Value *LHSArrayPtr = nullptr;
3457 Value *RHSArrayPtr = nullptr;
3458 Argument *Arg0 = ReductionFunc->getArg(0);
3459 Argument *Arg1 = ReductionFunc->getArg(1);
3460 Type *Arg0Type = Arg0->getType();
3461 Type *Arg1Type = Arg1->getType();
3462
3463 Value *LHSAlloca =
3464 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3465 Value *RHSAlloca =
3466 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3467 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3468 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3469 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3470 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3471 Builder.CreateStore(Arg0, LHSAddrCast);
3472 Builder.CreateStore(Arg1, RHSAddrCast);
3473 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3474 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3475
3476 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3477 Type *IndexTy = Builder.getIndexTy(
3478 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3479 SmallVector<Value *> LHSPtrs, RHSPtrs;
3480 for (auto En : enumerate(ReductionInfos)) {
3481 const ReductionInfo &RI = En.value();
3482 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3483 RedArrayTy, RHSArrayPtr,
3484 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3485 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3486 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RHSI8Ptr, RI.PrivateVariable->getType(),
3488 RHSI8Ptr->getName() + ".ascast");
3489
3490 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3491 RedArrayTy, LHSArrayPtr,
3492 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3493 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3494 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3495 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3496
3497 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3498 LHSPtrs.emplace_back(LHSPtr);
3499 RHSPtrs.emplace_back(RHSPtr);
3500 } else {
3501 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3502 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3503 Value *Reduced;
3504 InsertPointOrErrorTy AfterIP =
3505 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3506 if (!AfterIP)
3507 return AfterIP.takeError();
3508 if (!Builder.GetInsertBlock())
3509 return ReductionFunc;
3510 Builder.CreateStore(Reduced, LHSPtr);
3511 }
3512 }
3513
3514 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3515 for (auto En : enumerate(ReductionInfos)) {
3516 unsigned Index = En.index();
3517 const ReductionInfo &RI = En.value();
3518 Value *LHSFixupPtr, *RHSFixupPtr;
3519 Builder.restoreIP(RI.ReductionGenClang(
3520 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3521
3522 // Fix the callback code generated to use the correct Values for the LHS
3523 // and RHS.
3524 LHSFixupPtr->replaceUsesWithIf(
3525 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3526 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3527 ReductionFunc;
3528 });
3529 RHSFixupPtr->replaceUsesWithIf(
3530 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3531 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3532 ReductionFunc;
3533 });
3534 }
3535
3536 Builder.CreateRetVoid();
3537 return ReductionFunc;
3538}
3539
3540static void
3541 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3542 bool IsGPU) {
3543 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3544 (void)RI;
3545 assert(RI.Variable && "expected non-null variable");
3546 assert(RI.PrivateVariable && "expected non-null private variable");
3547 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3548 "expected non-null reduction generator callback");
3549 if (!IsGPU) {
3550 assert(
3551 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3552 "expected variables and their private equivalents to have the same "
3553 "type");
3554 }
3555 assert(RI.Variable->getType()->isPointerTy() &&
3556 "expected variables to be pointers");
3557 }
3558}
3559
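// GPU lowering outline: create the elementwise reduction function, pack the
// private reduction variables into a type-erased pointer array, and emit a
// call to __kmpc_nvptx_parallel_reduce_nowait_v2 with the shuffle-and-reduce
// and inter-warp copy helpers, or, for teams reductions, to
// __kmpc_nvptx_teams_reduce_nowait_v2 with the four list<->global helpers
// and the runtime's fixed reduction buffer. A result of 1 means this thread
// finalizes the reduced values in the then-branch below.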
3560OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3561 const LocationDescription &Loc, InsertPointTy AllocaIP,
3562 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3563 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3564 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3565 Value *SrcLocInfo) {
3566 if (!updateToLocation(Loc))
3567 return InsertPointTy();
3568 Builder.restoreIP(CodeGenIP);
3569 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3570 LLVMContext &Ctx = M.getContext();
3571
3572 // Source location for the ident struct
3573 if (!SrcLocInfo) {
3574 uint32_t SrcLocStrSize;
3575 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3576 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3577 }
3578
3579 if (ReductionInfos.size() == 0)
3580 return Builder.saveIP();
3581
3582 BasicBlock *ContinuationBlock = nullptr;
3583 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3584 // Copied code from createReductions
3585 BasicBlock *InsertBlock = Loc.IP.getBlock();
3586 ContinuationBlock =
3587 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3588 InsertBlock->getTerminator()->eraseFromParent();
3589 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3590 }
3591
3592 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3593 AttributeList FuncAttrs;
3594 AttrBuilder AttrBldr(Ctx);
3595 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3596 AttrBldr.addAttribute(Attr);
3597 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3598 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3599
3600 CodeGenIP = Builder.saveIP();
3601 Expected<Function *> ReductionResult =
3602 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3603 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3604 if (!ReductionResult)
3605 return ReductionResult.takeError();
3606 Function *ReductionFunc = *ReductionResult;
3607 Builder.restoreIP(CodeGenIP);
3608
3609 // Set the grid value in the config needed for lowering later on
3610 if (GridValue.has_value())
3611 Config.setGridValue(GridValue.value());
3612 else
3613 Config.setGridValue(getGridValue(T, ReductionFunc));
3614
3615 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3616 // RedList, shuffle_reduce_func, interwarp_copy_func);
3617 // or
3618 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3619 Value *Res;
3620
3621 // 1. Build a list of reduction variables.
3622 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3623 auto Size = ReductionInfos.size();
3624 Type *PtrTy = PointerType::getUnqual(Ctx);
3625 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3626 CodeGenIP = Builder.saveIP();
3627 Builder.restoreIP(AllocaIP);
3628 Value *ReductionListAlloca =
3629 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3630 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3631 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3632 Builder.restoreIP(CodeGenIP);
3633 Type *IndexTy = Builder.getIndexTy(
3634 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3635 for (auto En : enumerate(ReductionInfos)) {
3636 const ReductionInfo &RI = En.value();
3637 Value *ElemPtr = Builder.CreateInBoundsGEP(
3638 RedArrayTy, ReductionList,
3639 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3640 Value *CastElem =
3641 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3642 Builder.CreateStore(CastElem, ElemPtr);
3643 }
3644 CodeGenIP = Builder.saveIP();
3645 Function *SarFunc =
3646 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3647 Expected<Function *> CopyResult =
3648 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3649 if (!CopyResult)
3650 return CopyResult.takeError();
3651 Function *WcFunc = *CopyResult;
3652 Builder.restoreIP(CodeGenIP);
3653
3654 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3655
3656 unsigned MaxDataSize = 0;
3657 SmallVector<Type *> ReductionTypeArgs;
3658 for (auto En : enumerate(ReductionInfos)) {
3659 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3660 if (Size > MaxDataSize)
3661 MaxDataSize = Size;
3662 ReductionTypeArgs.emplace_back(En.value().ElementType);
3663 }
3664 Value *ReductionDataSize =
3665 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3666 if (!IsTeamsReduction) {
3667 Value *SarFuncCast =
3668 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3669 Value *WcFuncCast =
3670 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3671 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3672 WcFuncCast};
3673 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3674 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3675 Res = Builder.CreateCall(Pv2Ptr, Args);
3676 } else {
3677 CodeGenIP = Builder.saveIP();
3678 StructType *ReductionsBufferTy = StructType::create(
3679 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3680 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
3681 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3682 Function *LtGCFunc = emitListToGlobalCopyFunction(
3683 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3684 Function *LtGRFunc = emitListToGlobalReduceFunction(
3685 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3686 Function *GtLCFunc = emitGlobalToListCopyFunction(
3687 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3688 Function *GtLRFunc = emitGlobalToListReduceFunction(
3689 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3690 Builder.restoreIP(CodeGenIP);
3691
3692 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3693 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3694
3695 Value *Args3[] = {SrcLocInfo,
3696 KernelTeamsReductionPtr,
3697 Builder.getInt32(ReductionBufNum),
3698 ReductionDataSize,
3699 RL,
3700 SarFunc,
3701 WcFunc,
3702 LtGCFunc,
3703 LtGRFunc,
3704 GtLCFunc,
3705 GtLRFunc};
3706
3707 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3708 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3709 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3710 }
3711
3712 // 5. Build if (res == 1)
3713 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3714 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3715 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3716 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3717
3718 // 6. Build then branch: where we have reduced values in the master
3719 // thread in each team.
3720 // __kmpc_end_reduce{_nowait}(<gtid>);
3721 // break;
3722 emitBlock(ThenBB, CurFunc);
3723
3724 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3725 for (auto En : enumerate(ReductionInfos)) {
3726 const ReductionInfo &RI = En.value();
3727 Value *LHS = RI.Variable;
3728 Value *RHS =
3729 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3730
3731 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3732 Value *LHSPtr, *RHSPtr;
3733 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3734 &LHSPtr, &RHSPtr, CurFunc));
3735
3736 // Fix the callback code generated to use the correct Values for the LHS
3737 // and RHS.
3738 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3739 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3740 ReductionFunc;
3741 });
3742 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3743 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3744 ReductionFunc;
3745 });
3746 } else {
3747 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3748 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3749 Value *Reduced;
3750 InsertPointOrErrorTy AfterIP =
3751 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3752 if (!AfterIP)
3753 return AfterIP.takeError();
3754 Builder.CreateStore(Reduced, LHS, false);
3755 }
3756 }
3757 emitBlock(ExitBB, CurFunc);
3758 if (ContinuationBlock) {
3759 Builder.CreateBr(ContinuationBlock);
3760 Builder.SetInsertPoint(ContinuationBlock);
3761 }
3762 Config.setEmitLLVMUsed();
3763
3764 return Builder.saveIP();
3765}
3766
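// Returns a fresh, empty void(ptr, ptr) reduction function; its body is
// filled in later by populateReductionFunction below.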
3767 static Function *getFreshReductionFunc(Module &M) {
3768 Type *VoidTy = Type::getVoidTy(M.getContext());
3769 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3770 auto *FuncTy =
3771 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3772 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3773 ".omp.reduction.func", &M);
3774}
3775
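// Populates the body of an outlined reduction function: for each reduction,
// load the element pointers out of the two type-erased pointer arrays (on
// GPUs, after first spilling the arguments through address-space-cast
// allocas), apply the user-supplied reduction generator, and store the
// result back unless the variable is reduced by-ref.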
3776 static Error populateReductionFunction(
3777 Function *ReductionFunc,
3778 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3779 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3780 Module *Module = ReductionFunc->getParent();
3781 BasicBlock *ReductionFuncBlock =
3782 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3783 Builder.SetInsertPoint(ReductionFuncBlock);
3784 Value *LHSArrayPtr = nullptr;
3785 Value *RHSArrayPtr = nullptr;
3786 if (IsGPU) {
3787 // Need to alloca memory here and deal with the pointers before getting
3788 // LHS/RHS pointers out
3789 //
3790 Argument *Arg0 = ReductionFunc->getArg(0);
3791 Argument *Arg1 = ReductionFunc->getArg(1);
3792 Type *Arg0Type = Arg0->getType();
3793 Type *Arg1Type = Arg1->getType();
3794
3795 Value *LHSAlloca =
3796 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3797 Value *RHSAlloca =
3798 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3799 Value *LHSAddrCast =
3800 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3801 Value *RHSAddrCast =
3802 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3803 Builder.CreateStore(Arg0, LHSAddrCast);
3804 Builder.CreateStore(Arg1, RHSAddrCast);
3805 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3806 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3807 } else {
3808 LHSArrayPtr = ReductionFunc->getArg(0);
3809 RHSArrayPtr = ReductionFunc->getArg(1);
3810 }
3811
3812 unsigned NumReductions = ReductionInfos.size();
3813 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3814
3815 for (auto En : enumerate(ReductionInfos)) {
3816 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3817 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3818 RedArrayTy, LHSArrayPtr, 0, En.index());
3819 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3820 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3821 LHSI8Ptr, RI.Variable->getType());
3822 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3823 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3824 RedArrayTy, RHSArrayPtr, 0, En.index());
3825 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3826 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3827 RHSI8Ptr, RI.PrivateVariable->getType());
3828 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3829 Value *Reduced;
3830 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3831 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3832 if (!AfterIP)
3833 return AfterIP.takeError();
3834
3835 Builder.restoreIP(*AfterIP);
3836 // TODO: Consider flagging an error.
3837 if (!Builder.GetInsertBlock())
3838 return Error::success();
3839
3840 // The store is inside the reduction region when using by-ref.
3841 if (!IsByRef[En.index()])
3842 Builder.CreateStore(Reduced, LHSPtr);
3843 }
3844 Builder.CreateRetVoid();
3845 return Error::success();
3846}
3847
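// Host (non-GPU) lowering outline: the private reduction pointers are
// packed into "red.array", __kmpc_reduce{_nowait} is called, and its result
// is switched on: 1 selects the non-atomic path (load both values, apply
// the reduction generator, store back, then __kmpc_end_reduce{_nowait}),
// 2 selects the atomic path, and any other value falls through to the
// continuation block.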
3848OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3849 const LocationDescription &Loc, InsertPointTy AllocaIP,
3850 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3851 bool IsNoWait, bool IsTeamsReduction) {
3852 assert(ReductionInfos.size() == IsByRef.size());
3853 if (Config.isGPU())
3854 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3855 IsNoWait, IsTeamsReduction);
3856
3857 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3858
3859 if (!updateToLocation(Loc))
3860 return InsertPointTy();
3861
3862 if (ReductionInfos.size() == 0)
3863 return Builder.saveIP();
3864
3865 BasicBlock *InsertBlock = Loc.IP.getBlock();
3866 BasicBlock *ContinuationBlock =
3867 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3868 InsertBlock->getTerminator()->eraseFromParent();
3869
3870 // Create and populate array of type-erased pointers to private reduction
3871 // values.
3872 unsigned NumReductions = ReductionInfos.size();
3873 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3874 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3875 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3876
3877 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3878
3879 for (auto En : enumerate(ReductionInfos)) {
3880 unsigned Index = En.index();
3881 const ReductionInfo &RI = En.value();
3882 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3883 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3884 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3885 }
3886
3887 // Emit a call to the runtime function that orchestrates the reduction.
3888 // Declare the reduction function in the process.
3889 Type *IndexTy = Builder.getIndexTy(
3890 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3891 Function *Func = Builder.GetInsertBlock()->getParent();
3892 Module *Module = Func->getParent();
3893 uint32_t SrcLocStrSize;
3894 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3895 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3896 return RI.AtomicReductionGen;
3897 });
3898 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3899 CanGenerateAtomic
3900 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3901 : IdentFlag(0));
3902 Value *ThreadId = getOrCreateThreadID(Ident);
3903 Constant *NumVariables = Builder.getInt32(NumReductions);
3904 const DataLayout &DL = Module->getDataLayout();
3905 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3906 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3907 Function *ReductionFunc = getFreshReductionFunc(*Module);
3908 Value *Lock = getOMPCriticalRegionLock(".reduction");
3909 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3910 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3911 : RuntimeFunction::OMPRTL___kmpc_reduce);
3912 CallInst *ReduceCall =
3913 Builder.CreateCall(ReduceFunc,
3914 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3915 ReductionFunc, Lock},
3916 "reduce");
3917
3918 // Create final reduction entry blocks for the atomic and non-atomic case.
3919 // Emit IR that dispatches control flow to one of the blocks based on the
3920 // reduction supporting the atomic mode.
3921 BasicBlock *NonAtomicRedBlock =
3922 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3923 BasicBlock *AtomicRedBlock =
3924 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3925 SwitchInst *Switch =
3926 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3927 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3928 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3929
3930 // Populate the non-atomic reduction using the elementwise reduction function.
3931 // This loads the elements from the global and private variables and reduces
3932 // them before storing back the result to the global variable.
3933 Builder.SetInsertPoint(NonAtomicRedBlock);
3934 for (auto En : enumerate(ReductionInfos)) {
3935 const ReductionInfo &RI = En.value();
3936 Type *ValueType = RI.ElementType;
3937 // We have one less load for the by-ref case because that load is now
3938 // inside the reduction region.
3939 Value *RedValue = RI.Variable;
3940 if (!IsByRef[En.index()]) {
3941 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3942 "red.value." + Twine(En.index()));
3943 }
3944 Value *PrivateRedValue =
3945 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3946 "red.private.value." + Twine(En.index()));
3947 Value *Reduced;
3948 InsertPointOrErrorTy AfterIP =
3949 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3950 if (!AfterIP)
3951 return AfterIP.takeError();
3952 Builder.restoreIP(*AfterIP);
3953
3954 if (!Builder.GetInsertBlock())
3955 return InsertPointTy();
3956 // For the by-ref case, the load is inside the reduction region.
3957 if (!IsByRef[En.index()])
3958 Builder.CreateStore(Reduced, RI.Variable);
3959 }
3960 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3961 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3962 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3963 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3964 Builder.CreateBr(ContinuationBlock);
3965
3966 // Populate the atomic reduction using the atomic elementwise reduction
3967 // function. There are no loads/stores here because they happen inside
3968 // the atomic elementwise reduction.
3969 Builder.SetInsertPoint(AtomicRedBlock);
3970 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3971 for (const ReductionInfo &RI : ReductionInfos) {
3972 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3973 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3974 if (!AfterIP)
3975 return AfterIP.takeError();
3976 Builder.restoreIP(*AfterIP);
3977 if (!Builder.GetInsertBlock())
3978 return InsertPointTy();
3979 }
3980 Builder.CreateBr(ContinuationBlock);
3981 } else {
3982 Builder.CreateUnreachable();
3983 }
3984
3985 // Populate the outlined reduction function using the elementwise reduction
3986 // function. Partial values are extracted from the type-erased array of
3987 // pointers to private variables.
3988 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3989 IsByRef, /*isGPU=*/false);
3990 if (Err)
3991 return Err;
3992
3993 if (!Builder.GetInsertBlock())
3994 return InsertPointTy();
3995
3996 Builder.SetInsertPoint(ContinuationBlock);
3997 return Builder.saveIP();
3998}
3999
4000OpenMPIRBuilder::InsertPointOrErrorTy
4001OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4002 BodyGenCallbackTy BodyGenCB,
4003 FinalizeCallbackTy FiniCB) {
4004 if (!updateToLocation(Loc))
4005 return Loc.IP;
4006
4007 Directive OMPD = Directive::OMPD_master;
4008 uint32_t SrcLocStrSize;
4009 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4010 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4011 Value *ThreadId = getOrCreateThreadID(Ident);
4012 Value *Args[] = {Ident, ThreadId};
4013
4014 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4015 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4016
4017 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4018 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4019
4020 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4021 /*Conditional*/ true, /*hasFinalize*/ true);
4022}
4023
4024OpenMPIRBuilder::InsertPointOrErrorTy
4025OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4026 BodyGenCallbackTy BodyGenCB,
4027 FinalizeCallbackTy FiniCB, Value *Filter) {
4028 if (!updateToLocation(Loc))
4029 return Loc.IP;
4030
4031 Directive OMPD = Directive::OMPD_masked;
4032 uint32_t SrcLocStrSize;
4033 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4034 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4035 Value *ThreadId = getOrCreateThreadID(Ident);
4036 Value *Args[] = {Ident, ThreadId, Filter};
4037 Value *ArgsEnd[] = {Ident, ThreadId};
4038
4039 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4040 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4041
4042 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4043 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4044
4045 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4046 /*Conditional*/ true, /*hasFinalize*/ true);
4047}
4048
4049 static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
4050 llvm::FunctionCallee Callee,
4051 ArrayRef<llvm::Value *> Args,
4052 const llvm::Twine &Name) {
4053 llvm::CallInst *Call = Builder.CreateCall(
4054 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4055 Call->setDoesNotThrow();
4056 return Call;
4057}
4058
4059 // Expects the input basic block to be dominated by BeforeScanBB. Once the
4060 // scan directive is encountered, the code after it should be dominated by
4061 // AfterScanBB. The scan directive splits the code sequence into an input
4062 // phase and a scan phase. Based on whether the inclusive or exclusive
4063 // clause is used on the scan directive, and whether the input loop or the
4064 // scan loop is being lowered, it adds jumps to the input and scan phases.
4065 // The first scan loop is the input loop and the second is the scan loop.
4066 // The code generated currently handles only inclusive scans.
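// Illustrative two-pass shape of the lowering for an inclusive scan
// (simplified; loop emission and phase splitting happen elsewhere):
//   for (i ...) { <input phase>; buffer[i] = red; }  // first scan loop
//   <emitScanReduction combines buffer in place>
//   for (i ...) { red = buffer[i]; <scan phase>; }   // second scan loop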
4067OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4068 const LocationDescription &Loc, InsertPointTy AllocaIP,
4069 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4070 bool IsInclusive, ScanInfo *ScanRedInfo) {
4071 if (ScanRedInfo->OMPFirstScanLoop) {
4072 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4073 ScanVarsType, ScanRedInfo);
4074 if (Err)
4075 return Err;
4076 }
4077 if (!updateToLocation(Loc))
4078 return Loc.IP;
4079
4080 llvm::Value *IV = ScanRedInfo->IV;
4081
4082 if (ScanRedInfo->OMPFirstScanLoop) {
4083 // Emit buffer[i] = red; at the end of the input phase.
4084 for (size_t i = 0; i < ScanVars.size(); i++) {
4085 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4086 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4087 Type *DestTy = ScanVarsType[i];
4088 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4089 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4090
4091 Builder.CreateStore(Src, Val);
4092 }
4093 }
4094 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4095 emitBlock(ScanRedInfo->OMPScanDispatch,
4096 Builder.GetInsertBlock()->getParent());
4097
4098 if (!ScanRedInfo->OMPFirstScanLoop) {
4099 IV = ScanRedInfo->IV;
4100 // Emit red = buffer[i]; at the entrance to the scan phase.
4101 // TODO: for an exclusive scan, this needs to be updated to red = buffer[i-1].
4102 for (size_t i = 0; i < ScanVars.size(); i++) {
4103 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4104 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4105 Type *DestTy = ScanVarsType[i];
4106 Value *SrcPtr =
4107 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4108 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4109 Builder.CreateStore(Src, ScanVars[i]);
4110 }
4111 }
4112
4113 // TODO: Update it to CreateBr and remove dead blocks
4114 llvm::Value *CmpI = Builder.getInt1(true);
4115 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4116 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4117 ScanRedInfo->OMPAfterScanBlock);
4118 } else {
4119 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4120 ScanRedInfo->OMPBeforeScanBlock);
4121 }
4122 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4123 Builder.GetInsertBlock()->getParent());
4124 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4125 return Builder.saveIP();
4126}
4127
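// Allocates the scan bookkeeping: one pointer slot per scan variable at the
// alloca point, plus a heap buffer of span+1 elements per variable that the
// master thread mallocs inside a masked region and publishes through
// ScanBuffPtrs; a trailing barrier makes the buffers visible to the team.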
4128Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4129 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4130 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4131
4132 Builder.restoreIP(AllocaIP);
4133 // Create the shared pointer at alloca IP.
4134 for (size_t i = 0; i < ScanVars.size(); i++) {
4135 llvm::Value *BuffPtr =
4136 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4137 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4138 }
4139
4140 // Allocate the temporary buffer; this is done by the master thread.
4141 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4142 InsertPointTy CodeGenIP) -> Error {
4143 Builder.restoreIP(CodeGenIP);
4144 Value *AllocSpan =
4145 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4146 for (size_t i = 0; i < ScanVars.size(); i++) {
4147 Type *IntPtrTy = Builder.getInt32Ty();
4148 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4149 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4150 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4151 AllocSpan, nullptr, "arr");
4152 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4153 }
4154 return Error::success();
4155 };
4156 // TODO: Perform finalization actions for variables. This has to be
4157 // called for variables which have destructors/finalizers.
4158 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4159
4160 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4161 llvm::Value *FilterVal = Builder.getInt32(0);
4162 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4163 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4164
4165 if (!AfterIP)
4166 return AfterIP.takeError();
4167 Builder.restoreIP(*AfterIP);
4168 BasicBlock *InputBB = Builder.GetInsertBlock();
4169 if (InputBB->getTerminator())
4170 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4171 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4172 if (!AfterIP)
4173 return AfterIP.takeError();
4174 Builder.restoreIP(*AfterIP);
4175
4176 return Error::success();
4177}
4178
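// Finalization for scan-based directives: the master thread copies the last
// element (buffer[span]) of each temporary buffer back into the original
// reduction variable and frees the buffer; a trailing barrier closes the
// region.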
4179Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4180 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4181 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4182 InsertPointTy CodeGenIP) -> Error {
4183 Builder.restoreIP(CodeGenIP);
4184 for (ReductionInfo RedInfo : ReductionInfos) {
4185 Value *PrivateVar = RedInfo.PrivateVariable;
4186 Value *OrigVar = RedInfo.Variable;
4187 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4188 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4189
4190 Type *SrcTy = RedInfo.ElementType;
4191 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4192 "arrayOffset");
4193 Value *Src = Builder.CreateLoad(SrcTy, Val);
4194
4195 Builder.CreateStore(Src, OrigVar);
4196 Builder.CreateFree(Buff);
4197 }
4198 return Error::success();
4199 };
4200 // TODO: Perform finalization actions for variables. This has to be
4201 // called for variables which have destructors/finalizers.
4202 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4203
4204 if (ScanRedInfo->OMPScanFinish->getTerminator())
4205 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4206 else
4207 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4208
4209 llvm::Value *FilterVal = Builder.getInt32(0);
4210 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4211 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4212
4213 if (!AfterIP)
4214 return AfterIP.takeError();
4215 Builder.restoreIP(*AfterIP);
4216 BasicBlock *InputBB = Builder.GetInsertBlock();
4217 if (InputBB->getTerminator())
4218 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4219 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4220 if (!AfterIP)
4221 return AfterIP.takeError();
4222 Builder.restoreIP(*AfterIP);
4223 return Error::success();
4224}
4225
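// Combines the scan buffers in place with a log-step loop (Hillis-Steele
// style): in pass k, element i (for i >= pow2k) absorbs element i - pow2k,
// doubling the combined span each pass. Illustrative trace with op = + on
// [1, 1, 1, 1]: pass 0 -> [1, 2, 2, 2], pass 1 -> [1, 2, 3, 4], i.e. the
// inclusive prefix sums.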
4226OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4227 const LocationDescription &Loc,
4228 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4229 ScanInfo *ScanRedInfo) {
4230
4231 if (!updateToLocation(Loc))
4232 return Loc.IP;
4233 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4234 InsertPointTy CodeGenIP) -> Error {
4235 Builder.restoreIP(CodeGenIP);
4236 Function *CurFn = Builder.GetInsertBlock()->getParent();
4237 // for (int k = 0; k <= ceil(log2(n)); ++k)
4238 llvm::BasicBlock *LoopBB =
4239 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4240 llvm::BasicBlock *ExitBB =
4241 splitBB(Builder, false, "omp.outer.log.scan.exit");
4242 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4243 Builder.GetInsertBlock()->getModule(),
4244 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4245 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4246 llvm::Value *Arg =
4247 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4248 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4249 F = llvm::Intrinsic::getOrInsertDeclaration(
4250 Builder.GetInsertBlock()->getModule(),
4251 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4252 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4253 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4254 llvm::Value *NMin1 = Builder.CreateNUWSub(
4255 ScanRedInfo->Span,
4256 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4257 Builder.SetInsertPoint(InputBB);
4258 Builder.CreateBr(LoopBB);
4259 emitBlock(LoopBB, CurFn);
4260 Builder.SetInsertPoint(LoopBB);
4261
4262 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4263 // size pow2k = 1;
4264 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4265 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4266 InputBB);
4267 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4268 InputBB);
4269 // for (size i = n - 1; i >= 2 ^ k; --i)
4270 // tmp[i] op= tmp[i-pow2k];
4271 llvm::BasicBlock *InnerLoopBB =
4272 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4273 llvm::BasicBlock *InnerExitBB =
4274 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4275 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4276 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4277 emitBlock(InnerLoopBB, CurFn);
4278 Builder.SetInsertPoint(InnerLoopBB);
4279 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4280 IVal->addIncoming(NMin1, LoopBB);
4281 for (ReductionInfo RedInfo : ReductionInfos) {
4282 Value *ReductionVal = RedInfo.PrivateVariable;
4283 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4284 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4285 Type *DestTy = RedInfo.ElementType;
4286 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4287 Value *LHSPtr =
4288 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4289 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4290 Value *RHSPtr =
4291 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4292 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4293 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4294 llvm::Value *Result;
4295 InsertPointOrErrorTy AfterIP =
4296 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4297 if (!AfterIP)
4298 return AfterIP.takeError();
4299 Builder.CreateStore(Result, LHSPtr);
4300 }
4301 llvm::Value *NextIVal = Builder.CreateNUWSub(
4302 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4303 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4304 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4305 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4306 emitBlock(InnerExitBB, CurFn);
4307 llvm::Value *Next = Builder.CreateNUWAdd(
4308 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4309 Counter->addIncoming(Next, Builder.GetInsertBlock());
4310 // pow2k <<= 1;
4311 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4312 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4313 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4314 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4315 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4316 return Error::success();
4317 };
4318
4319 // TODO: Perform finalization actions for variables. This has to be
4320 // called for variables which have destructors/finalizers.
4321 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4322
4323 llvm::Value *FilterVal = Builder.getInt32(0);
4324 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4325 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4326
4327 if (!AfterIP)
4328 return AfterIP.takeError();
4329 Builder.restoreIP(*AfterIP);
4330 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4331
4332 if (!AfterIP)
4333 return AfterIP.takeError();
4334 Builder.restoreIP(*AfterIP);
4335 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4336 if (Err)
4337 return Err;
4338
4339 return AfterIP;
4340}
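// Illustrative trace of the in-place inclusive scan emitted above (not part
// of this file). Using '+' as the reduction and a 0-based buffer of four
// partial results {a, b, c, d}, each outer round applies
// tmp[i] op= tmp[i - pow2k] for i running from n-1 down to pow2k:
//
//   pow2k = 1: {a, a+b, b+c, c+d}
//   pow2k = 2: {a, a+b, a+b+c, a+b+c+d}
//
// After ceil(log2(4)) = 2 rounds, tmp[i] holds the inclusive prefix
// reduction of elements 0..i, using O(n log n) combiner applications.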
4341
4342Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4343 llvm::function_ref<Error()> InputLoopGen,
4344 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4345 ScanInfo *ScanRedInfo) {
4346
4347 {
4348 // Emit loop with input phase:
4349 // for (i: 0..<num_iters>) {
4350 // <input phase>;
4351 // buffer[i] = red;
4352 // }
4353 ScanRedInfo->OMPFirstScanLoop = true;
4354 Error Err = InputLoopGen();
4355 if (Err)
4356 return Err;
4357 }
4358 {
4359 // Emit loop with scan phase:
4360 // for (i: 0..<num_iters>) {
4361 // red = buffer[i];
4362 // <scan phase>;
4363 // }
4364 ScanRedInfo->OMPFirstScanLoop = false;
4365 Error Err = ScanLoopGen(Builder.saveIP());
4366 if (Err)
4367 return Err;
4368 }
4369 return Error::success();
4370}
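// Illustrative source-level view of the two-pass scheme above (assumed
// example, not from this file). For an inclusive scan such as
//
//   #pragma omp simd reduction(inscan, +:red)
//   for (int i = 0; i < n; ++i) {
//     red += in[i];                      // input phase
//     #pragma omp scan inclusive(red)
//     out[i] = red;                      // scan phase
//   }
//
// InputLoopGen emits: for (i: 0..n) { red += in[i]; buffer[i] = red; }
// ScanLoopGen emits:  for (i: 0..n) { red = buffer[i]; out[i] = red; }
// with emitScanReduction combining the buffered values in between.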
4371
4372void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4373 Function *Fun = Builder.GetInsertBlock()->getParent();
4374 ScanRedInfo->OMPScanDispatch =
4375 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4376 ScanRedInfo->OMPAfterScanBlock =
4377 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4378 ScanRedInfo->OMPBeforeScanBlock =
4379 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4380 ScanRedInfo->OMPScanLoopExit =
4381 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4382}
4383CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4384 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4385 BasicBlock *PostInsertBefore, const Twine &Name) {
4386 Module *M = F->getParent();
4387 LLVMContext &Ctx = M->getContext();
4388 Type *IndVarTy = TripCount->getType();
4389
4390 // Create the basic block structure.
4391 BasicBlock *Preheader =
4392 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4393 BasicBlock *Header =
4394 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4395 BasicBlock *Cond =
4396 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4397 BasicBlock *Body =
4398 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4399 BasicBlock *Latch =
4400 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4401 BasicBlock *Exit =
4402 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4403 BasicBlock *After =
4404 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4405
4406 // Use specified DebugLoc for new instructions.
4407 Builder.SetCurrentDebugLocation(DL);
4408
4409 Builder.SetInsertPoint(Preheader);
4410 Builder.CreateBr(Header);
4411
4412 Builder.SetInsertPoint(Header);
4413 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4414 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4415 Builder.CreateBr(Cond);
4416
4417 Builder.SetInsertPoint(Cond);
4418 Value *Cmp =
4419 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4420 Builder.CreateCondBr(Cmp, Body, Exit);
4421
4422 Builder.SetInsertPoint(Body);
4423 Builder.CreateBr(Latch);
4424
4425 Builder.SetInsertPoint(Latch);
4426 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4427 "omp_" + Name + ".next", /*HasNUW=*/true);
4428 Builder.CreateBr(Header);
4429 IndVarPHI->addIncoming(Next, Latch);
4430
4431 Builder.SetInsertPoint(Exit);
4432 Builder.CreateBr(After);
4433
4434 // Remember and return the canonical control flow.
4435 LoopInfos.emplace_front();
4436 CanonicalLoopInfo *CL = &LoopInfos.front();
4437
4438 CL->Header = Header;
4439 CL->Cond = Cond;
4440 CL->Latch = Latch;
4441 CL->Exit = Exit;
4442
4443#ifndef NDEBUG
4444 CL->assertOK();
4445#endif
4446 return CL;
4447}
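// Sketch of the CFG produced above (illustrative):
//
//   preheader -> header -> cond --(iv < tripcount)--> body -> latch
//                  ^         |                                  |
//                  |         +--(else)--> exit -> after         |
//                  +---------------------------------------------+
//
// The induction variable is the only PHI: it starts at 0 (incoming from the
// preheader) and is incremented by 1 in the latch, which is exactly the
// shape CanonicalLoopInfo::assertOK checks for.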
4448
4449Expected<CanonicalLoopInfo *>
4450OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4451 LoopBodyGenCallbackTy BodyGenCB,
4452 Value *TripCount, const Twine &Name) {
4453 BasicBlock *BB = Loc.IP.getBlock();
4454 BasicBlock *NextBB = BB->getNextNode();
4455
4456 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4457 NextBB, NextBB, Name);
4458 BasicBlock *After = CL->getAfter();
4459
4460 // If location is not set, don't connect the loop.
4461 if (updateToLocation(Loc)) {
4462 // Split the loop at the insertion point: Branch to the preheader and move
4463 // every following instruction to after the loop (the After BB). Also, the
4464 // new successor is the loop's after block.
4465 spliceBB(Builder, After, /*CreateBranch=*/false);
4466 Builder.CreateBr(CL->getPreheader());
4467 }
4468
4469 // Emit the body content. We do it after connecting the loop to the CFG so
4470 // that the callback does not encounter degenerate BBs.
4471 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4472 return Err;
4473
4474#ifndef NDEBUG
4475 CL->assertOK();
4476#endif
4477 return CL;
4478}
4479
4480Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4481 ScanInfos.emplace_front();
4482 ScanInfo *Result = &ScanInfos.front();
4483 return Result;
4484}
4485
4486Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4487OpenMPIRBuilder::createCanonicalScanLoops(
4488 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4489 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4490 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4491 LocationDescription ComputeLoc =
4492 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4493 updateToLocation(ComputeLoc);
4494
4494
4495 SmallVector<llvm::CanonicalLoopInfo *> Result;
4496
4497 Value *TripCount = calculateCanonicalLoopTripCount(
4498 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4499 ScanRedInfo->Span = TripCount;
4500 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4501 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4502
4503 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4504 Builder.restoreIP(CodeGenIP);
4505 ScanRedInfo->IV = IV;
4506 createScanBBs(ScanRedInfo);
4507 BasicBlock *InputBlock = Builder.GetInsertBlock();
4508 Instruction *Terminator = InputBlock->getTerminator();
4509 assert(Terminator->getNumSuccessors() == 1);
4510 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4511 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4512 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4513 Builder.GetInsertBlock()->getParent());
4514 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4515 emitBlock(ScanRedInfo->OMPScanLoopExit,
4516 Builder.GetInsertBlock()->getParent());
4517 Builder.CreateBr(ContinueBlock);
4518 Builder.SetInsertPoint(
4519 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4520 return BodyGenCB(Builder.saveIP(), IV);
4521 };
4522
4523 const auto &&InputLoopGen = [&]() -> Error {
4524 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4525 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4526 ComputeIP, Name, true, ScanRedInfo);
4527 if (!LoopInfo)
4528 return LoopInfo.takeError();
4529 Result.push_back(*LoopInfo);
4530 Builder.restoreIP((*LoopInfo)->getAfterIP());
4531 return Error::success();
4532 };
4533 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4534 Expected<CanonicalLoopInfo *> LoopInfo =
4535 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4536 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4537 if (!LoopInfo)
4538 return LoopInfo.takeError();
4539 Result.push_back(*LoopInfo);
4540 Builder.restoreIP((*LoopInfo)->getAfterIP());
4541 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4542 return Error::success();
4543 };
4544 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4545 if (Err)
4546 return Err;
4547 return Result;
4548}
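// Illustrative caller-side sketch (assumed usage; 'OMPBuilder', 'Loc',
// 'BodyGenCB' and the bound values are caller-provided, and error handling
// is elided):
//
//   Expected<ScanInfo *> SI = OMPBuilder.scanInfoInitialize();
//   auto Loops = OMPBuilder.createCanonicalScanLoops(
//       Loc, BodyGenCB, Start, Stop, Step, /*IsSigned=*/true,
//       /*InclusiveStop=*/false, /*ComputeIP=*/{}, "scanloop", *SI);
//
// On success, the returned vector holds two CanonicalLoopInfos: the
// input-phase loop and the scan-phase loop; BodyGenCB runs once for each.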
4549
4550Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4551 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4552 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4553
4554 // Consider the following difficulties (assuming 8-bit signed integers):
4555 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4556 // DO I = 1, 100, 50
4557 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4558 // DO I = 100, 0, -128
4559
4560 // Start, Stop and Step must be of the same integer type.
4561 auto *IndVarTy = cast<IntegerType>(Start->getType());
4562 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4563 assert(IndVarTy == Step->getType() && "Step type mismatch");
4564
4565 updateToLocation(Loc);
4566
4567 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4568 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4569
4570 // Like Step, but always positive.
4571 Value *Incr = Step;
4572
4573 // Distance between Start and Stop; always positive.
4574 Value *Span;
4575
4576 // Condition whether no iterations are executed at all, e.g. because
4577 // UB < LB.
4578 Value *ZeroCmp;
4579
4580 if (IsSigned) {
4581 // Ensure that increment is positive. If not, negate and invert LB and UB.
4582 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4583 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4584 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4585 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4586 Span = Builder.CreateSub(UB, LB, "", false, true);
4587 ZeroCmp = Builder.CreateICmp(
4588 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4589 } else {
4590 Span = Builder.CreateSub(Stop, Start, "", true);
4591 ZeroCmp = Builder.CreateICmp(
4592 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4593 }
4594
4595 Value *CountIfLooping;
4596 if (InclusiveStop) {
4597 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4598 } else {
4599 // Avoid incrementing past stop since it could overflow.
4600 Value *CountIfTwo = Builder.CreateAdd(
4601 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4602 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4603 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4604 }
4605
4606 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4607 "omp_" + Name + ".tripcount");
4608}
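// Worked examples for the logic above (illustrative, 8-bit integers):
//  * Fortran 'DO I = 1, 100, 50' (inclusive stop): Span = 99, Incr = 50, so
//    CountIfLooping = 99 udiv 50 + 1 = 2 iterations (I = 1 and 51).
//    Precomputing the count means the emitted loop never has to step the
//    counter past Stop, which is where the naive form could overflow.
//  * 'DO I = 100, 0, -128': negating a Step of INT_MIN wraps back to
//    INT_MIN, but reinterpreted as unsigned it is exactly the magnitude
//    128. LB/UB are swapped, Span = 100 <= Incr = 128 (unsigned), so
//    CountIfLooping = 1, i.e. a single iteration, as expected.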
4609
4610Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4611 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4612 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4613 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4614 ScanInfo *ScanRedInfo) {
4615 LocationDescription ComputeLoc =
4616 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4617
4618 Value *TripCount = calculateCanonicalLoopTripCount(
4619 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4620
4621 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4622 Builder.restoreIP(CodeGenIP);
4623 Value *Span = Builder.CreateMul(IV, Step);
4624 Value *IndVar = Builder.CreateAdd(Span, Start);
4625 if (InScan)
4626 ScanRedInfo->IV = IndVar;
4627 return BodyGenCB(Builder.saveIP(), IndVar);
4628 };
4629 LocationDescription LoopLoc =
4630 ComputeIP.isSet()
4631 ? Loc
4632 : LocationDescription(Builder.saveIP(),
4633 Builder.getCurrentDebugLocation());
4634 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4635}
4636
4637// Returns an LLVM function to call for initializing loop bounds using OpenMP
4638// static scheduling for composite `distribute parallel for` depending on
4639// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4640// integers as unsigned similarly to CanonicalLoopInfo.
4641static FunctionCallee
4642getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4643 OpenMPIRBuilder &OMPBuilder) {
4644 unsigned Bitwidth = Ty->getIntegerBitWidth();
4645 if (Bitwidth == 32)
4646 return OMPBuilder.getOrCreateRuntimeFunction(
4647 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4648 if (Bitwidth == 64)
4649 return OMPBuilder.getOrCreateRuntimeFunction(
4650 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4651 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4652}
4653
4654// Returns an LLVM function to call for initializing loop bounds using OpenMP
4655// static scheduling depending on `type`. Only i32 and i64 are supported by the
4656// runtime. Always interpret integers as unsigned similarly to
4657// CanonicalLoopInfo.
4658static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4659 OpenMPIRBuilder &OMPBuilder) {
4660 unsigned Bitwidth = Ty->getIntegerBitWidth();
4661 if (Bitwidth == 32)
4662 return OMPBuilder.getOrCreateRuntimeFunction(
4663 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4664 if (Bitwidth == 64)
4665 return OMPBuilder.getOrCreateRuntimeFunction(
4666 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4667 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4668}
4669
4670OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4671 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4672 WorksharingLoopType LoopType, bool NeedsBarrier) {
4673 assert(CLI->isValid() && "Requires a valid canonical loop");
4674 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4675 "Require dedicated allocate IP");
4676
4677 // Set up the source location value for OpenMP runtime.
4678 Builder.restoreIP(CLI->getPreheaderIP());
4679 Builder.SetCurrentDebugLocation(DL);
4680
4681 uint32_t SrcLocStrSize;
4682 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4683 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4684
4685 // Declare useful OpenMP runtime functions.
4686 Value *IV = CLI->getIndVar();
4687 Type *IVTy = IV->getType();
4688 FunctionCallee StaticInit =
4689 LoopType == WorksharingLoopType::DistributeForStaticLoop
4690 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4691 : getKmpcForStaticInitForType(IVTy, M, *this);
4692 FunctionCallee StaticFini =
4693 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4694
4695 // Allocate space for computed loop bounds as expected by the "init" function.
4696 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4697
4698 Type *I32Type = Type::getInt32Ty(M.getContext());
4699 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4700 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4701 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4702 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4703 CLI->setLastIter(PLastIter);
4704
4705 // At the end of the preheader, prepare for calling the "init" function by
4706 // storing the current loop bounds into the allocated space. A canonical loop
4707 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4708 // and produces an inclusive upper bound.
4709 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4710 Constant *Zero = ConstantInt::get(IVTy, 0);
4711 Constant *One = ConstantInt::get(IVTy, 1);
4712 Builder.CreateStore(Zero, PLowerBound);
4713 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4714 Builder.CreateStore(UpperBound, PUpperBound);
4715 Builder.CreateStore(One, PStride);
4716
4717 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4718
4719 OMPScheduleType SchedType =
4720 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4721 ? OMPScheduleType::OrderedDistribute
4722 : OMPScheduleType::UnorderedStatic;
4723 Constant *SchedulingType =
4724 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4725
4726 // Call the "init" function and update the trip count of the loop with the
4727 // value it produced.
4728 SmallVector<Value *, 10> Args(
4729 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4730 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4731 Value *PDistUpperBound =
4732 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4733 Args.push_back(PDistUpperBound);
4734 }
4735 Args.append({PStride, One, Zero});
4736 Builder.CreateCall(StaticInit, Args);
4737 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4738 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4739 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4740 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4741 CLI->setTripCount(TripCount);
4742
4743 // Update all uses of the induction variable except the one in the condition
4744 // block that compares it with the actual upper bound, and the increment in
4745 // the latch block.
4746
4747 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4748 Builder.SetInsertPoint(CLI->getBody(),
4749 CLI->getBody()->getFirstInsertionPt());
4750 Builder.SetCurrentDebugLocation(DL);
4751 return Builder.CreateAdd(OldIV, LowerBound);
4752 });
4753
4754 // In the "exit" block, call the "fini" function.
4755 Builder.SetInsertPoint(CLI->getExit(),
4756 CLI->getExit()->getTerminator()->getIterator());
4757 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4758
4759 // Add the barrier if requested.
4760 if (NeedsBarrier) {
4761 InsertPointOrErrorTy BarrierIP =
4762 createBarrier(LocationDescription(Builder.saveIP(), DL),
4763 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4764 /* CheckCancelFlag */ false);
4765 if (!BarrierIP)
4766 return BarrierIP.takeError();
4767 }
4768
4769 InsertPointTy AfterIP = CLI->getAfterIP();
4770 CLI->invalidate();
4771
4772 return AfterIP;
4773}
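// Illustrative runtime behavior (assumed typical libomp semantics, not
// guaranteed by this file): for trip count 10 on 4 threads, every thread
// calls the "init" function with *plower = 0, *pupper = 9, *pstride = 1 and
// gets its bounds rewritten in place, e.g.
//   thread 0: [0,2]  thread 1: [3,5]  thread 2: [6,8]  thread 3: [9,9]
// The code above then shrinks the canonical loop to ub - lb + 1 iterations
// and rebases every use of the induction variable by the loaded lower bound.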
4774
4775OpenMPIRBuilder::InsertPointOrErrorTy
4776OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4777 CanonicalLoopInfo *CLI,
4778 InsertPointTy AllocaIP,
4779 bool NeedsBarrier,
4780 Value *ChunkSize) {
4781 assert(CLI->isValid() && "Requires a valid canonical loop");
4782 assert(ChunkSize && "Chunk size is required");
4783
4784 LLVMContext &Ctx = CLI->getFunction()->getContext();
4785 Value *IV = CLI->getIndVar();
4786 Value *OrigTripCount = CLI->getTripCount();
4787 Type *IVTy = IV->getType();
4788 assert(IVTy->getIntegerBitWidth() <= 64 &&
4789 "Max supported tripcount bitwidth is 64 bits");
4790 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4791 : Type::getInt64Ty(Ctx);
4792 Type *I32Type = Type::getInt32Ty(M.getContext());
4793 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4794 Constant *One = ConstantInt::get(InternalIVTy, 1);
4795
4796 // Declare useful OpenMP runtime functions.
4797 FunctionCallee StaticInit =
4798 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4799 FunctionCallee StaticFini =
4800 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4801
4802 // Allocate space for computed loop bounds as expected by the "init" function.
4803 Builder.restoreIP(AllocaIP);
4804 Builder.SetCurrentDebugLocation(DL);
4805 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4806 Value *PLowerBound =
4807 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4808 Value *PUpperBound =
4809 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4810 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4811 CLI->setLastIter(PLastIter);
4812
4813 // Set up the source location value for the OpenMP runtime.
4814 Builder.restoreIP(CLI->getPreheaderIP());
4815 Builder.SetCurrentDebugLocation(DL);
4816
4817 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4818 Value *CastedChunkSize =
4819 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4820 Value *CastedTripCount =
4821 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4822
4823 Constant *SchedulingType = ConstantInt::get(
4824 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4825 Builder.CreateStore(Zero, PLowerBound);
4826 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4827 Builder.CreateStore(OrigUpperBound, PUpperBound);
4828 Builder.CreateStore(One, PStride);
4829
4830 // Call the "init" function and update the trip count of the loop with the
4831 // value it produced.
4832 uint32_t SrcLocStrSize;
4833 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4834 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4835 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4836 Builder.CreateCall(StaticInit,
4837 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4838 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4839 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4840 /*pstride=*/PStride, /*incr=*/One,
4841 /*chunk=*/CastedChunkSize});
4842
4843 // Load values written by the "init" function.
4844 Value *FirstChunkStart =
4845 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4846 Value *FirstChunkStop =
4847 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4848 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4849 Value *ChunkRange =
4850 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4851 Value *NextChunkStride =
4852 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4853
4854 // Create outer "dispatch" loop for enumerating the chunks.
4855 BasicBlock *DispatchEnter = splitBB(Builder, true);
4856 Value *DispatchCounter;
4857
4858 // It is safe to assume this didn't return an error because the callback
4859 // passed into createCanonicalLoop is the only possible error source, and it
4860 // always returns success.
4861 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4862 {Builder.saveIP(), DL},
4863 [&](InsertPointTy BodyIP, Value *Counter) {
4864 DispatchCounter = Counter;
4865 return Error::success();
4866 },
4867 FirstChunkStart, CastedTripCount, NextChunkStride,
4868 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4869 "dispatch"));
4870
4871 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4872 // not have to preserve the canonical invariant.
4873 BasicBlock *DispatchBody = DispatchCLI->getBody();
4874 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4875 BasicBlock *DispatchExit = DispatchCLI->getExit();
4876 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4877 DispatchCLI->invalidate();
4878
4879 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4880 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4881 redirectTo(CLI->getExit(), DispatchLatch, DL);
4882 redirectTo(DispatchBody, DispatchEnter, DL);
4883
4884 // Prepare the prolog of the chunk loop.
4885 Builder.restoreIP(CLI->getPreheaderIP());
4886 Builder.SetCurrentDebugLocation(DL);
4887
4888 // Compute the number of iterations of the chunk loop.
4889 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4890 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4891 Value *IsLastChunk =
4892 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4893 Value *CountUntilOrigTripCount =
4894 Builder.CreateSub(CastedTripCount, DispatchCounter);
4895 Value *ChunkTripCount = Builder.CreateSelect(
4896 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4897 Value *BackcastedChunkTC =
4898 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4899 CLI->setTripCount(BackcastedChunkTC);
4900
4901 // Update all uses of the induction variable except the one in the condition
4902 // block that compares it with the actual upper bound, and the increment in
4903 // the latch block.
4904 Value *BackcastedDispatchCounter =
4905 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4906 CLI->mapIndVar([&](Instruction *) -> Value * {
4907 Builder.restoreIP(CLI->getBodyIP());
4908 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4909 });
4910
4911 // In the "exit" block, call the "fini" function.
4912 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4913 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4914
4915 // Add the barrier if requested.
4916 if (NeedsBarrier) {
4917 InsertPointOrErrorTy AfterIP =
4918 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4919 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4920 if (!AfterIP)
4921 return AfterIP.takeError();
4922 }
4923
4924#ifndef NDEBUG
4925 // Even though we currently do not support applying additional methods to it,
4926 // the chunk loop should remain a canonical loop.
4927 CLI->assertOK();
4928#endif
4929
4930 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4931}
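// Sketch of the structure built above for schedule(static, 3) with trip
// count 10 and 2 threads (illustrative): the runtime returns the first
// chunk [0,2] with stride 6 to thread 0 and [3,5] to thread 1, so the
// generated nest behaves like
//
//   for (dispatch = firstchunk.lb; dispatch < 10; dispatch += 6)  // chunks
//     for (iv = 0; iv < min(3, 10 - dispatch); ++iv)              // chunk
//       body(dispatch + iv);
//
// giving thread 0 the iterations {0,1,2,6,7,8} and thread 1 the iterations
// {3,4,5,9} (the last chunk is clamped via omp_chunk.is_last).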
4932
4933// Returns an LLVM function to call for executing an OpenMP static worksharing
4934// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4935// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4936static FunctionCallee
4937getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4938 WorksharingLoopType LoopType) {
4939 unsigned Bitwidth = Ty->getIntegerBitWidth();
4940 Module &M = OMPBuilder->M;
4941 switch (LoopType) {
4942 case WorksharingLoopType::ForStaticLoop:
4943 if (Bitwidth == 32)
4944 return OMPBuilder->getOrCreateRuntimeFunction(
4945 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4946 if (Bitwidth == 64)
4947 return OMPBuilder->getOrCreateRuntimeFunction(
4948 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4949 break;
4950 case WorksharingLoopType::DistributeStaticLoop:
4951 if (Bitwidth == 32)
4952 return OMPBuilder->getOrCreateRuntimeFunction(
4953 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4954 if (Bitwidth == 64)
4955 return OMPBuilder->getOrCreateRuntimeFunction(
4956 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4957 break;
4958 case WorksharingLoopType::DistributeForStaticLoop:
4959 if (Bitwidth == 32)
4960 return OMPBuilder->getOrCreateRuntimeFunction(
4961 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4962 if (Bitwidth == 64)
4963 return OMPBuilder->getOrCreateRuntimeFunction(
4964 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4965 break;
4966 }
4967 if (Bitwidth != 32 && Bitwidth != 64) {
4968 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4969 }
4970 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4971}
4972
4973// Inserts a call to the proper OpenMP Device RTL function which handles
4974// loop worksharing.
4975static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4976 WorksharingLoopType LoopType,
4977 BasicBlock *InsertBlock, Value *Ident,
4978 Value *LoopBodyArg, Value *TripCount,
4979 Function &LoopBodyFn) {
4980 Type *TripCountTy = TripCount->getType();
4981 Module &M = OMPBuilder->M;
4982 IRBuilder<> &Builder = OMPBuilder->Builder;
4983 FunctionCallee RTLFn =
4984 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4985 SmallVector<Value *, 8> RealArgs;
4986 RealArgs.push_back(Ident);
4987 RealArgs.push_back(&LoopBodyFn);
4988 RealArgs.push_back(LoopBodyArg);
4989 RealArgs.push_back(TripCount);
4990 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4991 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4992 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4993 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4994 Builder.CreateCall(RTLFn, RealArgs);
4995 return;
4996 }
4997 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4998 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4999 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5000 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5001
5002 RealArgs.push_back(
5003 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5004 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5005 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5006 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5007 }
5008 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5009
5010 Builder.CreateCall(RTLFn, RealArgs);
5011}
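// Shape of the emitted device RTL call for a 32-bit 'for' loop
// (illustrative; the IR value names are made up):
//
//   %nt = call i32 @omp_get_num_threads()
//   %nt.cast = zext/trunc of %nt to the trip count type
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @outlined_body,
//                                        ptr %body_args, i32 %tripcount,
//                                        i32 %nt.cast, i32 0, i8 0)
//
// The distribute variant skips the num_threads query and passes only the
// trailing zero arguments, as handled by the early return above.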
5012
5013static void workshareLoopTargetCallback(
5014 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5015 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5016 WorksharingLoopType LoopType) {
5017 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5018 BasicBlock *Preheader = CLI->getPreheader();
5019 Value *TripCount = CLI->getTripCount();
5020
5021 // After loop body outlining, the loop body contains only the setup of the
5022 // loop body argument structure and the call to the outlined loop body
5023 // function. First, we need to move the setup of the loop body args into
5024 // the loop preheader.
5025 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5026 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5027
5028 // The next step is to remove the whole loop. We do not need it anymore.
5029 // That's why we make an unconditional branch from the loop preheader to
5030 // the loop exit block.
5031 Builder.restoreIP({Preheader, Preheader->end()});
5032 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5033 Preheader->getTerminator()->eraseFromParent();
5034 Builder.CreateBr(CLI->getExit());
5035
5036 // Delete dead loop blocks
5037 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5038 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5039 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5040 CleanUpInfo.EntryBB = CLI->getHeader();
5041 CleanUpInfo.ExitBB = CLI->getExit();
5042 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5043 DeleteDeadBlocks(BlocksToBeRemoved);
5044
5045 // Find the instruction which corresponds to the loop body argument
5046 // structure and remove the call to the loop body function.
5047 Value *LoopBodyArg;
5048 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5049 assert(OutlinedFnUser &&
5050 "Expected unique undroppable user of outlined function");
5051 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5052 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5053 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5054 "Expected outlined function call to be located in loop preheader");
5055 // Check in case no argument structure has been passed.
5056 if (OutlinedFnCallInstruction->arg_size() > 1)
5057 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5058 else
5059 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5060 OutlinedFnCallInstruction->eraseFromParent();
5061
5062 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5063 LoopBodyArg, TripCount, OutlinedFn);
5064
5065 for (auto &ToBeDeletedItem : ToBeDeleted)
5066 ToBeDeletedItem->eraseFromParent();
5067 CLI->invalidate();
5068}
5069
5070OpenMPIRBuilder::InsertPointTy
5071OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
5072 InsertPointTy AllocaIP,
5073 WorksharingLoopType LoopType) {
5074 uint32_t SrcLocStrSize;
5075 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5076 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5077
5078 OutlineInfo OI;
5079 OI.OuterAllocaBB = CLI->getPreheader();
5080 Function *OuterFn = CLI->getPreheader()->getParent();
5081
5082 // Instructions which need to be deleted at the end of code generation
5083 SmallVector<Instruction *, 4> ToBeDeleted;
5084
5085 OI.OuterAllocaBB = AllocaIP.getBlock();
5086
5087 // Mark the body loop as region which needs to be extracted
5088 OI.EntryBB = CLI->getBody();
5089 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5090 "omp.prelatch", true);
5091
5092 // Prepare loop body for extraction
5093 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5094
5095 // Insert new loop counter variable which will be used only in loop
5096 // body.
5097 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5098 Instruction *NewLoopCntLoad =
5099 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5100 // New loop counter instructions are redundant in the loop preheader when
5101 // code generation for the workshare loop is finished. That's why we mark
5102 // them as ready for deletion.
5103 ToBeDeleted.push_back(NewLoopCntLoad);
5104 ToBeDeleted.push_back(NewLoopCnt);
5105
5106 // Analyse loop body region. Find all input variables which are used inside
5107 // loop body region.
5108 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5109 SmallVector<BasicBlock *, 32> Blocks;
5110 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5111
5112 CodeExtractorAnalysisCache CEAC(*OuterFn);
5113 CodeExtractor Extractor(Blocks,
5114 /* DominatorTree */ nullptr,
5115 /* AggregateArgs */ true,
5116 /* BlockFrequencyInfo */ nullptr,
5117 /* BranchProbabilityInfo */ nullptr,
5118 /* AssumptionCache */ nullptr,
5119 /* AllowVarArgs */ true,
5120 /* AllowAlloca */ true,
5121 /* AllocationBlock */ CLI->getPreheader(),
5122 /* Suffix */ ".omp_wsloop",
5123 /* AggrArgsIn0AddrSpace */ true);
5124
5125 BasicBlock *CommonExit = nullptr;
5126 SetVector<Value *> SinkingCands, HoistingCands;
5127
5128 // Find allocas outside the loop body region which are used inside loop
5129 // body
5130 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5131
5132 // We need to model the loop body region as the function f(cnt, loop_arg).
5133 // That's why we replace the loop induction variable with the new counter,
5134 // which will be one of the loop body function's arguments.
5135 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5136 CLI->getIndVar()->user_end());
5137 for (auto Use : Users) {
5138 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5139 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5140 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5141 }
5142 }
5143 }
5144 // Make sure that the loop counter variable is not merged into the loop
5145 // body function's argument structure and is passed as a separate variable.
5146 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5147
5148 // PostOutline CB is invoked when loop body function is outlined and
5149 // loop body is replaced by call to outlined function. We need to add
5150 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
5151 // function will handle loop control logic.
5152 //
5153 OI.PostOutlineCB = [=, ToBeDeletedVec =
5154 std::move(ToBeDeleted)](Function &OutlinedFn) {
5155 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5156 LoopType);
5157 };
5158 addOutlineInfo(std::move(OI));
5159 return CLI->getAfterIP();
5160}
5161
5162OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5163 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5164 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5165 bool HasSimdModifier, bool HasMonotonicModifier,
5166 bool HasNonmonotonicModifier, bool HasOrderedClause,
5167 WorksharingLoopType LoopType) {
5168 if (Config.isTargetDevice())
5169 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
5170 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5171 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5172 HasNonmonotonicModifier, HasOrderedClause);
5173
5174 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5175 OMPScheduleType::ModifierOrdered;
5176 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5177 case OMPScheduleType::BaseStatic:
5178 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5179 if (IsOrdered)
5180 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5181 NeedsBarrier, ChunkSize);
5182 // FIXME: Monotonicity ignored?
5183 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5184
5185 case OMPScheduleType::BaseStaticChunked:
5186 if (IsOrdered)
5187 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5188 NeedsBarrier, ChunkSize);
5189 // FIXME: Monotonicity ignored?
5190 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5191 ChunkSize);
5192
5193 case OMPScheduleType::BaseRuntime:
5194 case OMPScheduleType::BaseAuto:
5195 case OMPScheduleType::BaseGreedy:
5196 case OMPScheduleType::BaseBalanced:
5197 case OMPScheduleType::BaseSteal:
5198 case OMPScheduleType::BaseGuidedSimd:
5199 case OMPScheduleType::BaseRuntimeSimd:
5200 assert(!ChunkSize &&
5201 "schedule type does not support user-defined chunk sizes");
5202 [[fallthrough]];
5203 case OMPScheduleType::BaseDynamicChunked:
5204 case OMPScheduleType::BaseGuidedChunked:
5205 case OMPScheduleType::BaseGuidedIterativeChunked:
5206 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5207 case OMPScheduleType::BaseStaticBalancedChunked:
5208 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5209 NeedsBarrier, ChunkSize);
5210
5211 default:
5212 llvm_unreachable("Unknown/unimplemented schedule kind");
5213 }
5214}
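// Summary of the dispatch above (illustrative):
//   schedule(static)            -> applyStaticWorkshareLoop
//   schedule(static, chunk)     -> applyStaticChunkedWorkshareLoop
//   dynamic/guided/runtime/auto -> applyDynamicWorkshareLoop
//   any 'ordered' schedule      -> applyDynamicWorkshareLoop
// and on target devices every schedule kind funnels into
// applyWorkshareLoopTarget before this switch is reached.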
5215
5216/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5217/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5218/// the runtime. Always interpret integers as unsigned similarly to
5219/// CanonicalLoopInfo.
5220static FunctionCallee
5221getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5222 unsigned Bitwidth = Ty->getIntegerBitWidth();
5223 if (Bitwidth == 32)
5224 return OMPBuilder.getOrCreateRuntimeFunction(
5225 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5226 if (Bitwidth == 64)
5227 return OMPBuilder.getOrCreateRuntimeFunction(
5228 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5229 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5230}
5231
5232/// Returns an LLVM function to call for fetching the next chunk using OpenMP
5233/// dynamic scheduling, depending on `type`. Only i32 and i64 are supported by
5234/// the runtime. Always interpret integers as unsigned similarly to
5235/// CanonicalLoopInfo.
5236static FunctionCallee
5237getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5238 unsigned Bitwidth = Ty->getIntegerBitWidth();
5239 if (Bitwidth == 32)
5240 return OMPBuilder.getOrCreateRuntimeFunction(
5241 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5242 if (Bitwidth == 64)
5243 return OMPBuilder.getOrCreateRuntimeFunction(
5244 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5245 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5246}
5247
5248/// Returns an LLVM function to call for finalizing the dynamic loop,
5249/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5250/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5251static FunctionCallee
5252getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5253 unsigned Bitwidth = Ty->getIntegerBitWidth();
5254 if (Bitwidth == 32)
5255 return OMPBuilder.getOrCreateRuntimeFunction(
5256 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5257 if (Bitwidth == 64)
5258 return OMPBuilder.getOrCreateRuntimeFunction(
5259 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5260 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5261}
5262
5263OpenMPIRBuilder::InsertPointOrErrorTy
5264OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5265 InsertPointTy AllocaIP,
5266 OMPScheduleType SchedType,
5267 bool NeedsBarrier, Value *Chunk) {
5268 assert(CLI->isValid() && "Requires a valid canonical loop");
5269 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5270 "Require dedicated allocate IP");
5272 "Require valid schedule type");
5273
5274 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5275 OMPScheduleType::ModifierOrdered;
5276
5277 // Set up the source location value for OpenMP runtime.
5278 Builder.SetCurrentDebugLocation(DL);
5279
5280 uint32_t SrcLocStrSize;
5281 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5282 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5283
5284 // Declare useful OpenMP runtime functions.
5285 Value *IV = CLI->getIndVar();
5286 Type *IVTy = IV->getType();
5287 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5288 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5289
5290 // Allocate space for computed loop bounds as expected by the "init" function.
5291 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5292 Type *I32Type = Type::getInt32Ty(M.getContext());
5293 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5294 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5295 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5296 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5297 CLI->setLastIter(PLastIter);
5298
5299 // At the end of the preheader, prepare for calling the "init" function by
5300 // storing the current loop bounds into the allocated space. A canonical loop
5301 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5302 // and produces an inclusive upper bound.
5303 BasicBlock *PreHeader = CLI->getPreheader();
5304 Builder.SetInsertPoint(PreHeader->getTerminator());
5305 Constant *One = ConstantInt::get(IVTy, 1);
5306 Builder.CreateStore(One, PLowerBound);
5307 Value *UpperBound = CLI->getTripCount();
5308 Builder.CreateStore(UpperBound, PUpperBound);
5309 Builder.CreateStore(One, PStride);
5310
5311 BasicBlock *Header = CLI->getHeader();
5312 BasicBlock *Exit = CLI->getExit();
5313 BasicBlock *Cond = CLI->getCond();
5314 BasicBlock *Latch = CLI->getLatch();
5315 InsertPointTy AfterIP = CLI->getAfterIP();
5316
5317 // The CLI will be "broken" in the code below, as the loop is no longer
5318 // a valid canonical loop.
5319
5320 if (!Chunk)
5321 Chunk = One;
5322
5323 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5324
5325 Constant *SchedulingType =
5326 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5327
5328 // Call the "init" function.
5329 Builder.CreateCall(DynamicInit,
5330 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5331 UpperBound, /* step */ One, Chunk});
5332
5333 // An outer loop around the existing one.
5334 BasicBlock *OuterCond = BasicBlock::Create(
5335 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5336 PreHeader->getParent());
5337 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5338 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5339 Value *Res =
5340 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5341 PLowerBound, PUpperBound, PStride});
5342 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5343 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5344 Value *LowerBound =
5345 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5346 Builder.CreateCondBr(MoreWork, Header, Exit);
5347
5348 // Change PHI-node in loop header to use outer cond rather than preheader,
5349 // and set IV to the LowerBound.
5350 Instruction *Phi = &Header->front();
5351 auto *PI = cast<PHINode>(Phi);
5352 PI->setIncomingBlock(0, OuterCond);
5353 PI->setIncomingValue(0, LowerBound);
5354
5355 // Then set the pre-header to jump to the OuterCond
5356 Instruction *Term = PreHeader->getTerminator();
5357 auto *Br = cast<BranchInst>(Term);
5358 Br->setSuccessor(0, OuterCond);
5359
5360 // Modify the inner condition:
5361 // * Use the UpperBound returned from the DynamicNext call.
5362 // * Jump to the outer loop when done with one of the inner loops.
5363 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5364 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5365 Instruction *Comp = &*Builder.GetInsertPoint();
5366 auto *CI = cast<CmpInst>(Comp);
5367 CI->setOperand(1, UpperBound);
5368 // Redirect the inner exit to branch to outer condition.
5369 Instruction *Branch = &Cond->back();
5370 auto *BI = cast<BranchInst>(Branch);
5371 assert(BI->getSuccessor(1) == Exit);
5372 BI->setSuccessor(1, OuterCond);
5373
5374 // Call the "fini" function if "ordered" is present in wsloop directive.
5375 if (Ordered) {
5376 Builder.SetInsertPoint(&Latch->back());
5377 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5378 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5379 }
5380
5381 // Add the barrier if requested.
5382 if (NeedsBarrier) {
5383 Builder.SetInsertPoint(&Exit->back());
5384 InsertPointOrErrorTy BarrierIP =
5385 createBarrier(LocationDescription(Builder.saveIP(), DL),
5386 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5387 /* CheckCancelFlag */ false);
5388 if (!BarrierIP)
5389 return BarrierIP.takeError();
5390 }
5391
5392 CLI->invalidate();
5393 return AfterIP;
5394}
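// Sketch of the rewritten control flow (illustrative):
//
//   preheader:  __kmpc_dispatch_init_4u(loc, tid, sched, /*lb=*/1,
//                                       /*ub=*/tripcount, /*st=*/1, chunk)
//   outer.cond: if (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &st))
//                 { iv = lb - 1; goto header; } else goto exit;
//   cond:       if (iv < ub) goto body; else goto outer.cond;
//
// The runtime hands out 1-based inclusive bounds, which is why the lower
// bound is decremented before seeding the 0-based induction variable and
// the inner compare keeps its strict unsigned less-than against ub.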
5395
5396/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5397/// after this \p OldTarget will be orphaned.
5398static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5399 BasicBlock *NewTarget, DebugLoc DL) {
5400 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5401 redirectTo(Pred, NewTarget, DL);
5402}
5403
5404/// Determine which blocks in \p BBs are reachable from outside and remove the
5405/// ones that are not reachable from the function.
5406static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5407 SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
5408 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5409 for (Use &U : BB->uses()) {
5410 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5411 if (!UseInst)
5412 continue;
5413 if (BBsToErase.count(UseInst->getParent()))
5414 continue;
5415 return true;
5416 }
5417 return false;
5418 };
5419
5420 while (BBsToErase.remove_if(HasRemainingUses)) {
5421 // Try again if anything was removed.
5422 }
5423
5424 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5425 DeleteDeadBlocks(BBVec);
5426}
5427
5428CanonicalLoopInfo *
5429OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5430 InsertPointTy ComputeIP) {
5431 assert(Loops.size() >= 1 && "At least one loop required");
5432 size_t NumLoops = Loops.size();
5433
5434 // Nothing to do if there is already just one loop.
5435 if (NumLoops == 1)
5436 return Loops.front();
5437
5438 CanonicalLoopInfo *Outermost = Loops.front();
5439 CanonicalLoopInfo *Innermost = Loops.back();
5440 BasicBlock *OrigPreheader = Outermost->getPreheader();
5441 BasicBlock *OrigAfter = Outermost->getAfter();
5442 Function *F = OrigPreheader->getParent();
5443
5444 // Loop control blocks that may become orphaned later.
5445 SmallVector<BasicBlock *, 12> OldControlBBs;
5446 OldControlBBs.reserve(6 * Loops.size());
5447 for (CanonicalLoopInfo *Loop : Loops)
5448 Loop->collectControlBlocks(OldControlBBs);
5449
5450 // Setup the IRBuilder for inserting the trip count computation.
5451 Builder.SetCurrentDebugLocation(DL);
5452 if (ComputeIP.isSet())
5453 Builder.restoreIP(ComputeIP);
5454 else
5455 Builder.restoreIP(Outermost->getPreheaderIP());
5456
5457 // Derive the collapsed loop's trip count.
5458 // TODO: Find common/largest indvar type.
5459 Value *CollapsedTripCount = nullptr;
5460 for (CanonicalLoopInfo *L : Loops) {
5461 assert(L->isValid() &&
5462 "All loops to collapse must be valid canonical loops");
5463 Value *OrigTripCount = L->getTripCount();
5464 if (!CollapsedTripCount) {
5465 CollapsedTripCount = OrigTripCount;
5466 continue;
5467 }
5468
5469 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5470 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5471 }
5472
5473 // Create the collapsed loop control flow.
5474 CanonicalLoopInfo *Result =
5475 createLoopSkeleton(DL, CollapsedTripCount, F,
5476 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5477
5478 // Build the collapsed loop body code.
5479 // Start with deriving the input loop induction variables from the collapsed
5480 // one, using a divmod scheme. To preserve the original loops' order, the
5481 // innermost loop uses the least significant bits.
5482 Builder.restoreIP(Result->getBodyIP());
5483
5484 Value *Leftover = Result->getIndVar();
5485 SmallVector<Value *> NewIndVars;
5486 NewIndVars.resize(NumLoops);
5487 for (int i = NumLoops - 1; i >= 1; --i) {
5488 Value *OrigTripCount = Loops[i]->getTripCount();
5489
5490 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5491 NewIndVars[i] = NewIndVar;
5492
5493 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5494 }
5495 // Outermost loop gets all the remaining bits.
5496 NewIndVars[0] = Leftover;
5497
5498 // Construct the loop body control flow.
5499 // We progressively construct the branch structure following the direction
5500 // of the control flow: the leading in-between code, the loop nest body, the
5501 // trailing in-between code, and rejoining the collapsed loop's latch.
5502 // ContinueBlock and ContinuePred keep track of the source(s) of the next
5503 // edge. If ContinueBlock is set, continue with that block. If ContinuePred
5504 // is set, use its predecessors as sources.
5505 BasicBlock *ContinueBlock = Result->getBody();
5506 BasicBlock *ContinuePred = nullptr;
5507 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5508 BasicBlock *NextSrc) {
5509 if (ContinueBlock)
5510 redirectTo(ContinueBlock, Dest, DL);
5511 else
5512 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5513
5514 ContinueBlock = nullptr;
5515 ContinuePred = NextSrc;
5516 };
5517
5518 // The code before the nested loop of each level.
5519 // Because we are sinking it into the nest, it will be executed more often
5520 // than in the original loop. More sophisticated schemes could keep track
5521 // of what the in-between code is and instantiate it only once per thread.
5522 for (size_t i = 0; i < NumLoops - 1; ++i)
5523 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5524
5525 // Connect the loop nest body.
5526 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5527
5528 // The code after the nested loop at each level.
5529 for (size_t i = NumLoops - 1; i > 0; --i)
5530 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5531
5532 // Connect the finished loop to the collapsed loop latch.
5533 ContinueWith(Result->getLatch(), nullptr);
5534
5535 // Replace the input loops with the new collapsed loop.
5536 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5537 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5538
5539 // Replace the input loop indvars with the derived ones.
5540 for (size_t i = 0; i < NumLoops; ++i)
5541 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5542
5543 // Remove unused parts of the input loops.
5544 removeUnusedBlocksFromParent(OldControlBBs);
5545
5546 for (CanonicalLoopInfo *L : Loops)
5547 L->invalidate();
5548
5549#ifndef NDEBUG
5550 Result->assertOK();
5551#endif
5552 return Result;
5553}
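// Worked example of the divmod de-linearization above (illustrative).
// Collapsing loops with trip counts 3 (outer) and 4 (inner) gives one loop
// of 3 * 4 = 12 iterations. For collapsed iv = 7:
//   inner = 7 urem 4 = 3,  leftover = 7 udiv 4 = 1  => (outer, inner) = (1, 3)
// which matches the original execution order because the innermost loop
// occupies the least significant "digits" of the collapsed counter.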
5554
5555std::vector<CanonicalLoopInfo *>
5556OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5557 ArrayRef<Value *> TileSizes) {
5558 assert(TileSizes.size() == Loops.size() &&
5559 "Must pass as many tile sizes as there are loops");
5560 int NumLoops = Loops.size();
5561 assert(NumLoops >= 1 && "At least one loop to tile required");
5562
5563 CanonicalLoopInfo *OutermostLoop = Loops.front();
5564 CanonicalLoopInfo *InnermostLoop = Loops.back();
5565 Function *F = OutermostLoop->getBody()->getParent();
5566 BasicBlock *InnerEnter = InnermostLoop->getBody();
5567 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5568
5569 // Loop control blocks that may become orphaned later.
5570 SmallVector<BasicBlock *, 12> OldControlBBs;
5571 OldControlBBs.reserve(6 * Loops.size());
5572 for (CanonicalLoopInfo *Loop : Loops)
5573 Loop->collectControlBlocks(OldControlBBs);
5574
5575 // Collect original trip counts and induction variable to be accessible by
5576 // index. Also, the structure of the original loops is not preserved during
5577 // the construction of the tiled loops, so do it before we scavenge the BBs of
5578 // any original CanonicalLoopInfo.
5579 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5580 for (CanonicalLoopInfo *L : Loops) {
5581 assert(L->isValid() && "All input loops must be valid canonical loops");
5582 OrigTripCounts.push_back(L->getTripCount());
5583 OrigIndVars.push_back(L->getIndVar());
5584 }
5585
5586 // Collect the code between loop headers. These may contain SSA definitions
5587 // that are used in the loop nest body. To be usable within the innermost
5588 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5589 // these instructions may be executed more often than before the tiling.
5590 // TODO: It would be sufficient to only sink them into the body of the
5591 // corresponding tile loop.
5592 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5593 for (int i = 0; i < NumLoops - 1; ++i) {
5594 CanonicalLoopInfo *Surrounding = Loops[i];
5595 CanonicalLoopInfo *Nested = Loops[i + 1];
5596
5597 BasicBlock *EnterBB = Surrounding->getBody();
5598 BasicBlock *ExitBB = Nested->getHeader();
5599 InbetweenCode.emplace_back(EnterBB, ExitBB);
5600 }
5601
5602 // Compute the trip counts of the floor loops.
5603 Builder.SetCurrentDebugLocation(DL);
5604 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5605 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5606 for (int i = 0; i < NumLoops; ++i) {
5607 Value *TileSize = TileSizes[i];
5608 Value *OrigTripCount = OrigTripCounts[i];
5609 Type *IVType = OrigTripCount->getType();
5610
5611 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5612 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5613
5614 // 0 if the tilesize divides the tripcount, 1 otherwise.
5615 // 1 means we need an additional iteration for a partial tile.
5616 //
5617 // Unfortunately we cannot just use the roundup-formula
5618 // (tripcount + tilesize - 1)/tilesize
5619 // because the summation might overflow. We do not want to introduce undefined
5620 // behavior when the untiled loop nest did not.
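// For example, OrigTripCount = 10 with TileSize = 4 gives
// FloorCompleteTripCount = 2 and FloorTripRem = 2, hence FloorTripCount = 3:
// two complete tiles plus one partial tile.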
5621 Value *FloorTripOverflow =
5622 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5623
5624 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5625 Value *FloorTripCount =
5626 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5627 "omp_floor" + Twine(i) + ".tripcount", true);
5628
5629 // Remember some values for later use.
5630 FloorCompleteCount.push_back(FloorCompleteTripCount);
5631 FloorCount.push_back(FloorTripCount);
5632 FloorRems.push_back(FloorTripRem);
5633 }
5634
5635 // Generate the new loop nest, from the outermost to the innermost.
5636 std::vector<CanonicalLoopInfo *> Result;
5637 Result.reserve(NumLoops * 2);
5638
5639 // The basic block of the surrounding loop that enters the newly generated
5640 // loop nest.
5641 BasicBlock *Enter = OutermostLoop->getPreheader();
5642
5643 // The basic block of the surrounding loop where the inner code should
5644 // continue.
5645 BasicBlock *Continue = OutermostLoop->getAfter();
5646
5647 // Where the next loop basic block should be inserted.
5648 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5649
5650 auto EmbeddNewLoop =
5651 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5652 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5653 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5654 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5655 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5656 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5657
5658 // Set up the position where the next embedded loop connects to this loop.
5659 Enter = EmbeddedLoop->getBody();
5660 Continue = EmbeddedLoop->getLatch();
5661 OutroInsertBefore = EmbeddedLoop->getLatch();
5662 return EmbeddedLoop;
5663 };
5664
5665 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5666 const Twine &NameBase) {
5667 for (auto P : enumerate(TripCounts)) {
5668 CanonicalLoopInfo *EmbeddedLoop =
5669 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5670 Result.push_back(EmbeddedLoop);
5671 }
5672 };
5673
5674 EmbeddNewLoops(FloorCount, "floor");
5675
5676 // Within the innermost floor loop, emit the code that computes the tile
5677 // sizes.
5678 Builder.SetInsertPoint(Enter->getTerminator());
5679 SmallVector<Value *, 4> TileCounts;
5680 for (int i = 0; i < NumLoops; ++i) {
5681 CanonicalLoopInfo *FloorLoop = Result[i];
5682 Value *TileSize = TileSizes[i];
5683
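// Only the last ("epilogue") floor iteration executes a partial tile of
// FloorRems[i] iterations; every other floor iteration executes a full tile.
// Continuing the example above: floor iterations 0 and 1 run 4 body
// iterations each, floor iteration 2 runs the remaining 2.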
5684 Value *FloorIsEpilogue =
5685 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5686 Value *TileTripCount =
5687 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5688
5689 TileCounts.push_back(TileTripCount);
5690 }
5691
5692 // Create the tile loops.
5693 EmbeddNewLoops(TileCounts, "tile");
5694
5695 // Insert the inbetween code into the body.
5696 BasicBlock *BodyEnter = Enter;
5697 BasicBlock *BodyEntered = nullptr;
5698 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5699 BasicBlock *EnterBB = P.first;
5700 BasicBlock *ExitBB = P.second;
5701
5702 if (BodyEnter)
5703 redirectTo(BodyEnter, EnterBB, DL);
5704 else
5705 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5706
5707 BodyEnter = nullptr;
5708 BodyEntered = ExitBB;
5709 }
5710
5711 // Append the original loop nest body into the generated loop nest body.
5712 if (BodyEnter)
5713 redirectTo(BodyEnter, InnerEnter, DL);
5714 else
5715 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5717
5718 // Replace the original induction variable with an induction variable computed
5719 // from the tile and floor induction variables.
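// That is, OrigIndVar = FloorIndVar * TileSize + TileIndVar; e.g. with a
// tile size of 4, floor indvar 2 and tile indvar 1 select original
// iteration 2 * 4 + 1 = 9.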
5720 Builder.restoreIP(Result.back()->getBodyIP());
5721 for (int i = 0; i < NumLoops; ++i) {
5722 CanonicalLoopInfo *FloorLoop = Result[i];
5723 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5724 Value *OrigIndVar = OrigIndVars[i];
5725 Value *Size = TileSizes[i];
5726
5727 Value *Scale =
5728 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5729 Value *Shift =
5730 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5731 OrigIndVar->replaceAllUsesWith(Shift);
5732 }
5733
5734 // Remove unused parts of the original loops.
5735 removeUnusedBlocksFromParent(OldControlBBs);
5736
5737 for (CanonicalLoopInfo *L : Loops)
5738 L->invalidate();
5739
5740#ifndef NDEBUG
5741 for (CanonicalLoopInfo *GenL : Result)
5742 GenL->assertOK();
5743#endif
5744 return Result;
5745}
5746
5747/// Attach metadata \p Properties to the basic block described by \p BB. If the
5748/// basic block already has metadata, the basic block properties are appended.
5749static void addBasicBlockMetadata(BasicBlock *BB,
5750 ArrayRef<Metadata *> Properties) {
5751 // Nothing to do if no property to attach.
5752 if (Properties.empty())
5753 return;
5754
5755 LLVMContext &Ctx = BB->getContext();
5756 SmallVector<Metadata *> NewProperties;
5757 NewProperties.push_back(nullptr);
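// The first operand is reserved for the self-reference that makes the
// loop-ID MDNode distinct; it is filled in below via replaceOperandWith.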
5758
5759 // If the basic block already has metadata, prepend it to the new metadata.
5760 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5761 if (Existing)
5762 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5763
5764 append_range(NewProperties, Properties);
5765 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5766 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5767
5768 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5769}
5770
5771/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5772/// loop already has metadata, the loop properties are appended.
5773static void addLoopMetadata(CanonicalLoopInfo *Loop,
5774 ArrayRef<Metadata *> Properties) {
5775 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5776
5777 // Attach metadata to the loop's latch
5778 BasicBlock *Latch = Loop->getLatch();
5779 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5780 addBasicBlockMetadata(Latch, Properties);
5781}
5782
5783/// Attach llvm.access.group metadata to the memref instructions of \p Block
5784static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5785 LoopInfo &LI) {
5786 for (Instruction &I : *Block) {
5787 if (I.mayReadOrWriteMemory()) {
5788 // TODO: This instruction may already have an access group from
5789 // other pragmas e.g. #pragma clang loop vectorize. Append
5790 // so that the existing metadata is not overwritten.
5791 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5792 }
5793 }
5794}
5795
5796void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5797 LLVMContext &Ctx = Builder.getContext();
5798 addLoopMetadata(
5799 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5800 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5801}
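// The latch terminator then carries metadata of the following shape
// (illustrative IR):
//   br i1 %cond, ..., !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.enable"}
//   !2 = !{!"llvm.loop.unroll.full"}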
5802
5803void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5804 LLVMContext &Ctx = Builder.getContext();
5805 addLoopMetadata(
5806 Loop, {
5807 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5808 });
5809}
5810
5811void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5812 Value *IfCond, ValueToValueMapTy &VMap,
5813 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5814 const Twine &NamePrefix) {
5815 Function *F = CanonicalLoop->getFunction();
5816
5817 // We can't do
5818 // if (cond) {
5819 // simd_loop;
5820 // } else {
5821 // non_simd_loop;
5822 // }
5823 // because then the CanonicalLoopInfo would only point to one of the loops:
5824 // leading to other constructs operating on the same loop to malfunction.
5825 // Instead generate
5826 // while (...) {
5827 // if (cond) {
5828 // simd_body;
5829 // } else {
5830 // not_simd_body;
5831 // }
5832 // }
5833 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5834 // body at -O3
5835
5836 // Define where the if branch should be inserted.
5837 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5838
5839 // Create additional blocks for the if statement
5840 BasicBlock *Cond = SplitBeforeIt->getParent();
5841 llvm::LLVMContext &C = Cond->getContext();
5842 BasicBlock *ThenBlock = BasicBlock::Create(
5843 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5844 BasicBlock *ElseBlock = BasicBlock::Create(
5845 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5846
5847 // Create if condition branch.
5848 Builder.SetInsertPoint(SplitBeforeIt);
5849 Instruction *BrInstr =
5850 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5851 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5852 // Then block contains branch to omp loop body which needs to be vectorized
5853 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5854 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5855
5856 Builder.SetInsertPoint(ElseBlock);
5857
5858 // Clone loop for the else branch
5859 SmallVector<BasicBlock *, 8> NewBlocks;
5860
5861 SmallVector<BasicBlock *, 8> ExistingBlocks;
5862 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5863 ExistingBlocks.push_back(ThenBlock);
5864 ExistingBlocks.append(L->block_begin(), L->block_end());
5865 // Cond is the block that has the if clause condition
5866 // LoopCond is omp_loop.cond
5867 // LoopHeader is omp_loop.header
5868 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5869 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
5870 assert(LoopCond && LoopHeader && "Invalid loop structure");
5871 for (BasicBlock *Block : ExistingBlocks) {
5872 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5873 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5874 continue;
5875 }
5876 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5877
5878 // Fix the name so that it is not omp.if.then.
5879 if (Block == ThenBlock)
5880 NewBB->setName(NamePrefix + ".if.else");
5881
5882 NewBB->moveBefore(CanonicalLoop->getExit());
5883 VMap[Block] = NewBB;
5884 NewBlocks.push_back(NewBB);
5885 }
5886 remapInstructionsInBlocks(NewBlocks, VMap);
5887 Builder.CreateBr(NewBlocks.front());
5888
5889 // The loop latch must have only one predecessor. Currently it is branched to
5890 // from both the 'then' and 'else' branches.
5891 L->getLoopLatch()->splitBasicBlock(
5892 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5893
5894 // Ensure that the then block is added to the loop so we add the attributes in
5895 // the next step
5896 L->addBasicBlockToLoop(ThenBlock, LI);
5897}
5898
5899unsigned
5900OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5901 const StringMap<bool> &Features) {
5902 if (TargetTriple.isX86()) {
5903 if (Features.lookup("avx512f"))
5904 return 512;
5905 else if (Features.lookup("avx"))
5906 return 256;
5907 return 128;
5908 }
5909 if (TargetTriple.isPPC())
5910 return 128;
5911 if (TargetTriple.isWasm())
5912 return 128;
5913 return 0;
5914}
5915
5916void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5917 MapVector<Value *, Value *> AlignedVars,
5918 Value *IfCond, OrderKind Order,
5919 ConstantInt *Simdlen, ConstantInt *Safelen) {
5920 LLVMContext &Ctx = Builder.getContext();
5921
5922 Function *F = CanonicalLoop->getFunction();
5923
5924 // TODO: We should not rely on pass manager. Currently we use pass manager
5925 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5926 // object. We should have a method which returns all blocks between
5927 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5928 FunctionAnalysisManager FAM;
5929 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5930 FAM.registerPass([]() { return LoopAnalysis(); });
5931 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5932
5933 LoopAnalysis LIA;
5934 LoopInfo &&LI = LIA.run(*F, FAM);
5935
5936 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5937 if (AlignedVars.size()) {
5938 InsertPointTy IP = Builder.saveIP();
5939 for (auto &AlignedItem : AlignedVars) {
5940 Value *AlignedPtr = AlignedItem.first;
5941 Value *Alignment = AlignedItem.second;
5942 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5943 Builder.SetInsertPoint(loadInst->getNextNode());
5944 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5945 Alignment);
5946 }
5947 Builder.restoreIP(IP);
5948 }
5949
5950 if (IfCond) {
5951 ValueToValueMapTy VMap;
5952 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5953 }
5954
5955 SmallPtrSet<BasicBlock *, 8> Reachable;
5956
5957 // Get the basic blocks from the loop in which memref instructions
5958 // can be found.
5959 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5960 // preferably without running any passes.
5961 for (BasicBlock *Block : L->getBlocks()) {
5962 if (Block == CanonicalLoop->getCond() ||
5963 Block == CanonicalLoop->getHeader())
5964 continue;
5965 Reachable.insert(Block);
5966 }
5967
5968 SmallVector<Metadata *> LoopMDList;
5969
5970 // In the presence of a finite 'safelen', it may be unsafe to mark all
5971 // the memory instructions parallel, because loop-carried
5972 // dependences of 'safelen' iterations are possible.
5973 // If clause order(concurrent) is specified then the memory instructions
5974 // are marked parallel even if 'safelen' is finite.
5975 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5976 // Add access group metadata to memory-access instructions.
5977 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5978 for (BasicBlock *BB : Reachable)
5979 addSimdMetadata(BB, AccessGroup, LI);
5980 // TODO: If the loop has existing parallel access metadata, have
5981 // to combine two lists.
5982 LoopMDList.push_back(MDNode::get(
5983 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5984 }
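// Each memory access in the loop body is now tagged with the access group,
// e.g. (illustrative IR):
//   %v = load i32, ptr %p, align 4, !llvm.access.group !AG
// and the loop metadata will reference it via
//   !{!"llvm.loop.parallel_accesses", !AG}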
5985
5986 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5987 // versions so we can't add the loop attributes in that case.
5988 if (IfCond) {
5989 // We can still add llvm.loop.parallel_accesses.
5990 addLoopMetadata(CanonicalLoop, LoopMDList);
5991 return;
5992 }
5993
5994 // Use the above access group metadata to create loop level
5995 // metadata, which should be distinct for each loop.
5996 ConstantAsMetadata *BoolConst =
5997 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5998 LoopMDList.push_back(MDNode::get(
5999 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6000
6001 if (Simdlen || Safelen) {
6002 // If both simdlen and safelen clauses are specified, the value of the
6003 // simdlen parameter must be less than or equal to the value of the safelen
6004 // parameter. Therefore, use safelen only in the absence of simdlen.
6005 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6006 LoopMDList.push_back(
6007 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6008 ConstantAsMetadata::get(VectorizeWidth)}));
6009 }
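// E.g. 'simd simdlen(8)' yields !{!"llvm.loop.vectorize.width", i32 8}
// (illustrative).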
6010
6011 addLoopMetadata(CanonicalLoop, LoopMDList);
6012}
6013
6014/// Create the TargetMachine object to query the backend for optimization
6015/// preferences.
6016///
6017/// Ideally, this would be passed from the front-end to the OpenMPIRBuilder, but
6018/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6019/// needed for the LLVM pass pipeline. We use some default options to avoid
6020/// having to pass too many settings from the frontend that probably do not
6021/// matter.
6022///
6023/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6024/// method. If we are going to use TargetMachine for more purposes, especially
6025/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6026/// might become worth requiring front-ends to pass on their TargetMachine,
6027/// or at least cache it between methods. Note that while frontends such as Clang
6028/// have just a single main TargetMachine per translation unit, "target-cpu" and
6029/// "target-features" that determine the TargetMachine are per-function and can
6030/// be overridden using __attribute__((target("OPTIONS"))).
6031static std::unique_ptr<TargetMachine>
6032createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6033 Module *M = F->getParent();
6034
6035 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6036 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6037 const llvm::Triple &Triple = M->getTargetTriple();
6038
6039 std::string Error;
6040 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6041 if (!TheTarget)
6042 return {};
6043
6044 llvm::TargetOptions Options;
6045 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6046 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6047 /*CodeModel=*/std::nullopt, OptLevel));
6048}
6049
6050/// Heuristically determine the best-performing unroll factor for \p CLI. This
6051/// depends on the target processor. We are re-using the same heuristics as the
6052/// LoopUnrollPass.
6053static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6054 Function *F = CLI->getFunction();
6055
6056 // Assume the user requests the most aggressive unrolling, even if the rest of
6057 // the code is optimized using a lower setting.
6058 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6059 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6060
6061 FunctionAnalysisManager FAM;
6062 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6063 FAM.registerPass([]() { return AssumptionAnalysis(); });
6064 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6065 FAM.registerPass([]() { return LoopAnalysis(); });
6066 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6067 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6068 TargetIRAnalysis TIRA;
6069 if (TM)
6070 TIRA = TargetIRAnalysis(
6071 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6072 FAM.registerPass([&]() { return TIRA; });
6073
6074 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6075 ScalarEvolutionAnalysis SEA;
6076 ScalarEvolution &&SE = SEA.run(*F, FAM);
6077 DominatorTreeAnalysis DTA;
6078 DominatorTree &&DT = DTA.run(*F, FAM);
6079 LoopAnalysis LIA;
6080 LoopInfo &&LI = LIA.run(*F, FAM);
6081 AssumptionAnalysis ACT;
6082 AssumptionCache &&AC = ACT.run(*F, FAM);
6083 OptimizationRemarkEmitter ORE{F};
6084
6085 Loop *L = LI.getLoopFor(CLI->getHeader());
6086 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6087
6089 L, SE, TTI,
6090 /*BlockFrequencyInfo=*/nullptr,
6091 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6092 /*UserThreshold=*/std::nullopt,
6093 /*UserCount=*/std::nullopt,
6094 /*UserAllowPartial=*/true,
6095 /*UserAllowRuntime=*/true,
6096 /*UserUpperBound=*/std::nullopt,
6097 /*UserFullUnrollMaxCount=*/std::nullopt);
6098
6099 UP.Force = true;
6100
6101 // Account for additional optimizations taking place before the LoopUnrollPass
6102 // would unroll the loop.
6103 UP.Threshold *= UnrollThresholdFactor;
6104 UP.PartialThreshold *= UnrollThresholdFactor;
6105
6106 // Use normal unroll factors even if the rest of the code is optimized for
6107 // size.
6108 UP.OptSizeThreshold = UP.Threshold;
6109 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6110
6111 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6112 << " Threshold=" << UP.Threshold << "\n"
6113 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6114 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6115 << " PartialOptSizeThreshold="
6116 << UP.PartialOptSizeThreshold << "\n");
6117
6118 // Disable peeling.
6119 TargetTransformInfo::PeelingPreferences PP =
6120 gatherPeelingPreferences(L, SE, TTI,
6121 /*UserAllowPeeling=*/false,
6122 /*UserAllowProfileBasedPeeling=*/false,
6123 /*UnrollingSpecficValues=*/false);
6124
6125 SmallPtrSet<const Value *, 32> EphValues;
6126 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6127
6128 // Assume that reads and writes to stack variables can be eliminated by
6129 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6130 // size.
6131 for (BasicBlock *BB : L->blocks()) {
6132 for (Instruction &I : *BB) {
6133 Value *Ptr;
6134 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6135 Ptr = Load->getPointerOperand();
6136 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6137 Ptr = Store->getPointerOperand();
6138 } else
6139 continue;
6140
6141 Ptr = Ptr->stripPointerCasts();
6142
6143 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6144 if (Alloca->getParent() == &F->getEntryBlock())
6145 EphValues.insert(&I);
6146 }
6147 }
6148 }
6149
6150 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6151
6152 // Loop is not unrollable if the loop contains certain instructions.
6153 if (!UCE.canUnroll()) {
6154 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6155 return 1;
6156 }
6157
6158 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6159 << "\n");
6160
6161 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6162 // be able to use it.
6163 int TripCount = 0;
6164 int MaxTripCount = 0;
6165 bool MaxOrZero = false;
6166 unsigned TripMultiple = 0;
6167
6168 bool UseUpperBound = false;
6169 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6170 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6171 UseUpperBound);
6172 unsigned Factor = UP.Count;
6173 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6174
6175 // This function returns 1 to signal not to unroll the loop.
6176 if (Factor == 0)
6177 return 1;
6178 return Factor;
6179}
6180
6181void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6182 int32_t Factor,
6183 CanonicalLoopInfo **UnrolledCLI) {
6184 assert(Factor >= 0 && "Unroll factor must not be negative");
6185
6186 Function *F = Loop->getFunction();
6187 LLVMContext &Ctx = F->getContext();
6188
6189 // If the unrolled loop is not used for another loop-associated directive, it
6190 // is sufficient to add metadata for the LoopUnrollPass.
6191 if (!UnrolledCLI) {
6192 SmallVector<Metadata *, 2> LoopMetadata;
6193 LoopMetadata.push_back(
6194 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6195
6196 if (Factor >= 1) {
6197 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6198 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6199 LoopMetadata.push_back(MDNode::get(
6200 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6201 }
6202
6203 addLoopMetadata(Loop, LoopMetadata);
6204 return;
6205 }
6206
6207 // Heuristically determine the unroll factor.
6208 if (Factor == 0)
6209 Factor = computeHeuristicUnrollFactor(Loop);
6210
6211 // No change required with unroll factor 1.
6212 if (Factor == 1) {
6213 *UnrolledCLI = Loop;
6214 return;
6215 }
6216
6217 assert(Factor >= 2 &&
6218 "unrolling only makes sense with a factor of 2 or larger");
6219
6220 Type *IndVarTy = Loop->getIndVarType();
6221
6222 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6223 // unroll the inner loop.
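// E.g. with Factor = 4, a loop of N iterations becomes a floor loop of
// ceildiv(N, 4) iterations around a tile loop of at most 4 iterations; the
// tile loop is then marked for unrolling below.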
6224 Value *FactorVal =
6225 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6226 /*isSigned=*/false));
6227 std::vector<CanonicalLoopInfo *> LoopNest =
6228 tileLoops(DL, {Loop}, {FactorVal});
6229 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6230 *UnrolledCLI = LoopNest[0];
6231 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6232
6233 // LoopUnrollPass can only fully unroll loops with constant trip count.
6234 // Unroll by the unroll factor with a fallback epilog for the remainder
6235 // iterations if necessary.
6236 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6237 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6238 addLoopMetadata(
6239 InnerLoop,
6240 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6241 MDNode::get(
6242 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6243
6244#ifndef NDEBUG
6245 (*UnrolledCLI)->assertOK();
6246#endif
6247}
6248
6249OpenMPIRBuilder::InsertPointTy
6250OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6251 llvm::Value *BufSize, llvm::Value *CpyBuf,
6252 llvm::Value *CpyFn, llvm::Value *DidIt) {
6253 if (!updateToLocation(Loc))
6254 return Loc.IP;
6255
6256 uint32_t SrcLocStrSize;
6257 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6258 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6259 Value *ThreadId = getOrCreateThreadID(Ident);
6260
6261 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6262
6263 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6264
6265 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6266 Builder.CreateCall(Fn, Args);
6267
6268 return Builder.saveIP();
6269}
6270
6271OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6272 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6273 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6274 ArrayRef<llvm::Function *> CPFuncs) {
6275
6276 if (!updateToLocation(Loc))
6277 return Loc.IP;
6278
6279 // If needed, allocate and initialize `DidIt` with 0.
6280 // DidIt: flag variable: 1=single thread; 0=not single thread.
6281 llvm::Value *DidIt = nullptr;
6282 if (!CPVars.empty()) {
6283 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6284 Builder.CreateStore(Builder.getInt32(0), DidIt);
6285 }
6286
6287 Directive OMPD = Directive::OMPD_single;
6288 uint32_t SrcLocStrSize;
6289 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6290 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6291 Value *ThreadId = getOrCreateThreadID(Ident);
6292 Value *Args[] = {Ident, ThreadId};
6293
6294 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6295 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6296
6297 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6298 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6299
6300 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6301 if (Error Err = FiniCB(IP))
6302 return Err;
6303
6304 // The thread that executes the single region must set `DidIt` to 1.
6305 // This is used by __kmpc_copyprivate, to know if the caller is the
6306 // single thread or not.
6307 if (DidIt)
6308 Builder.CreateStore(Builder.getInt32(1), DidIt);
6309
6310 return Error::success();
6311 };
6312
6313 // Generates the following:
6314 // if (__kmpc_single()) {
6315 // .... single region ...
6316 // __kmpc_end_single
6317 // }
6318 // __kmpc_copyprivate
6319 // __kmpc_barrier
6320
6321 InsertPointOrErrorTy AfterIP =
6322 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6323 /*Conditional*/ true,
6324 /*hasFinalize*/ true);
6325 if (!AfterIP)
6326 return AfterIP.takeError();
6327
6328 if (DidIt) {
6329 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6330 // NOTE BufSize is currently unused, so just pass 0.
6331 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6332 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6333 CPFuncs[I], DidIt);
6334 // NOTE __kmpc_copyprivate already inserts a barrier
6335 } else if (!IsNowait) {
6336 InsertPointOrErrorTy AfterIP =
6337 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6338 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6339 /* CheckCancelFlag */ false);
6340 if (!AfterIP)
6341 return AfterIP.takeError();
6342 }
6343 return Builder.saveIP();
6344}
6345
6346OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6347 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6348 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6349
6350 if (!updateToLocation(Loc))
6351 return Loc.IP;
6352
6353 Directive OMPD = Directive::OMPD_critical;
6354 uint32_t SrcLocStrSize;
6355 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6356 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6357 Value *ThreadId = getOrCreateThreadID(Ident);
6358 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6359 Value *Args[] = {Ident, ThreadId, LockVar};
6360
6361 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6362 Function *RTFn = nullptr;
6363 if (HintInst) {
6364 // Add Hint to entry Args and create call
6365 EnterArgs.push_back(HintInst);
6366 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6367 } else {
6368 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6369 }
6370 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6371
6372 Function *ExitRTLFn =
6373 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6374 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6375
6376 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6377 /*Conditional*/ false, /*hasFinalize*/ true);
6378}
6379
6380OpenMPIRBuilder::InsertPointTy
6381OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6382 InsertPointTy AllocaIP, unsigned NumLoops,
6383 ArrayRef<llvm::Value *> StoreValues,
6384 const Twine &Name, bool IsDependSource) {
6385 assert(
6386 llvm::all_of(StoreValues,
6387 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6388 "OpenMP runtime requires depend vec with i64 type");
6389
6390 if (!updateToLocation(Loc))
6391 return Loc.IP;
6392
6393 // Allocate space for vector and generate alloc instruction.
6394 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6395 Builder.restoreIP(AllocaIP);
6396 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6397 ArgsBase->setAlignment(Align(8));
6398 updateToLocation(Loc);
6399
6400 // Store the index value with offset in depend vector.
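// E.g. for '#pragma omp ordered depend(source: i, j)', StoreValues holds the
// current values of the i and j iteration variables (illustrative).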
6401 for (unsigned I = 0; I < NumLoops; ++I) {
6402 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6403 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6404 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6405 STInst->setAlignment(Align(8));
6406 }
6407
6408 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6409 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6410
6411 uint32_t SrcLocStrSize;
6412 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6413 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6414 Value *ThreadId = getOrCreateThreadID(Ident);
6415 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6416
6417 Function *RTLFn = nullptr;
6418 if (IsDependSource)
6419 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6420 else
6421 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6422 Builder.CreateCall(RTLFn, Args);
6423
6424 return Builder.saveIP();
6425}
6426
6427OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6428 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6429 FinalizeCallbackTy FiniCB, bool IsThreads) {
6430 if (!updateToLocation(Loc))
6431 return Loc.IP;
6432
6433 Directive OMPD = Directive::OMPD_ordered;
6434 Instruction *EntryCall = nullptr;
6435 Instruction *ExitCall = nullptr;
6436
6437 if (IsThreads) {
6438 uint32_t SrcLocStrSize;
6439 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6440 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6441 Value *ThreadId = getOrCreateThreadID(Ident);
6442 Value *Args[] = {Ident, ThreadId};
6443
6444 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6445 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6446
6447 Function *ExitRTLFn =
6448 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6449 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6450 }
6451
6452 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6453 /*Conditional*/ false, /*hasFinalize*/ true);
6454}
6455
6456OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6457 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6458 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6459 bool HasFinalize, bool IsCancellable) {
6460
6461 if (HasFinalize)
6462 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6463
6464 // Create inlined region's entry and body blocks, in preparation
6465 // for conditional creation
6466 BasicBlock *EntryBB = Builder.GetInsertBlock();
6467 Instruction *SplitPos = EntryBB->getTerminator();
6468 if (!isa_and_nonnull<BranchInst>(SplitPos))
6469 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6470 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6471 BasicBlock *FiniBB =
6472 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
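// CFG after the two splits (sketch): EntryBB -> omp_region.finalize ->
// omp_region.end; the body is emitted after the entry call, and
// emitCommonDirectiveEntry may insert a conditional branch around it.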
6473
6474 Builder.SetInsertPoint(EntryBB->getTerminator());
6475 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6476
6477 // generate body
6478 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6479 /* CodeGenIP */ Builder.saveIP()))
6480 return Err;
6481
6482 // emit exit call and do any needed finalization.
6483 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6484 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6485 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6486 "Unexpected control flow graph state!!");
6487 InsertPointOrErrorTy AfterIP =
6488 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6489 if (!AfterIP)
6490 return AfterIP.takeError();
6491 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6492 "Unexpected Control Flow State!");
6493 MergeBlockIntoPredecessor(FiniBB);
6494
6495 // If we are skipping the region of a non-conditional, remove the exit
6496 // block, and clear the builder's insertion point.
6497 assert(SplitPos->getParent() == ExitBB &&
6498 "Unexpected Insertion point location!");
6499 auto merged = MergeBlockIntoPredecessor(ExitBB);
6500 BasicBlock *ExitPredBB = SplitPos->getParent();
6501 auto InsertBB = merged ? ExitPredBB : ExitBB;
6502 if (!isa_and_nonnull<BranchInst>(SplitPos))
6503 SplitPos->eraseFromParent();
6504 Builder.SetInsertPoint(InsertBB);
6505
6506 return Builder.saveIP();
6507}
6508
6509OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6510 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6511 // If there is nothing to do, return the current insertion point.
6512 if (!Conditional || !EntryCall)
6513 return Builder.saveIP();
6514
6515 BasicBlock *EntryBB = Builder.GetInsertBlock();
6516 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6517 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6518 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6519
6520 // Emit thenBB and set the Builder's insertion point there for
6521 // body generation next. Place the block after the current block.
6522 Function *CurFn = EntryBB->getParent();
6523 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6524
6525 // Move Entry branch to end of ThenBB, and replace with conditional
6526 // branch (If-stmt)
6527 Instruction *EntryBBTI = EntryBB->getTerminator();
6528 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6529 EntryBBTI->removeFromParent();
6530 Builder.SetInsertPoint(UI);
6531 Builder.Insert(EntryBBTI);
6532 UI->eraseFromParent();
6533 Builder.SetInsertPoint(ThenBB->getTerminator());
6534
6535 // return an insertion point to ExitBB.
6536 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6537}
6538
6539OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6540 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6541 bool HasFinalize) {
6542
6543 Builder.restoreIP(FinIP);
6544
6545 // If there is finalization to do, emit it before the exit call
6546 if (HasFinalize) {
6547 assert(!FinalizationStack.empty() &&
6548 "Unexpected finalization stack state!");
6549
6550 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6551 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6552
6553 if (Error Err = Fi.FiniCB(FinIP))
6554 return Err;
6555
6556 BasicBlock *FiniBB = FinIP.getBlock();
6557 Instruction *FiniBBTI = FiniBB->getTerminator();
6558
6559 // set Builder IP for call creation
6560 Builder.SetInsertPoint(FiniBBTI);
6561 }
6562
6563 if (!ExitCall)
6564 return Builder.saveIP();
6565
6566 // Place the exit call as the last instruction before the finalization block terminator.
6567 ExitCall->removeFromParent();
6568 Builder.Insert(ExitCall);
6569
6570 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6571 ExitCall->getIterator());
6572}
6573
6574OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6575 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6576 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6577 if (!IP.isSet())
6578 return IP;
6579
6580 IRBuilder<>::InsertPointGuard IPG(Builder);
6581
6582 // creates the following CFG structure
6583 // OMP_Entry : (MasterAddr != PrivateAddr)?
6584 // F T
6585 // | \
6586 // | copyin.not.master
6587 // | /
6588 // v /
6589 // copyin.not.master.end
6590 // |
6591 // v
6592 // OMP.Entry.Next
6593
6594 BasicBlock *OMP_Entry = IP.getBlock();
6595 Function *CurFn = OMP_Entry->getParent();
6596 BasicBlock *CopyBegin =
6597 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6598 BasicBlock *CopyEnd = nullptr;
6599
6600 // If the entry block is terminated, split it to preserve the branch to the
6601 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6602 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6603 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6604 "copyin.not.master.end");
6605 OMP_Entry->getTerminator()->eraseFromParent();
6606 } else {
6607 CopyEnd =
6608 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6609 }
6610
6611 Builder.SetInsertPoint(OMP_Entry);
6612 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6613 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6614 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6615 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6616
6617 Builder.SetInsertPoint(CopyBegin);
6618 if (BranchtoEnd)
6619 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6620
6621 return Builder.saveIP();
6622}
6623
6624CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6625 Value *Size, Value *Allocator,
6626 std::string Name) {
6627 IRBuilder<>::InsertPointGuard IPG(Builder);
6628 updateToLocation(Loc);
6629
6630 uint32_t SrcLocStrSize;
6631 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6632 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6633 Value *ThreadId = getOrCreateThreadID(Ident);
6634 Value *Args[] = {ThreadId, Size, Allocator};
6635
6636 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6637
6638 return Builder.CreateCall(Fn, Args, Name);
6639}
6640
6641CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6642 Value *Addr, Value *Allocator,
6643 std::string Name) {
6644 IRBuilder<>::InsertPointGuard IPG(Builder);
6645 updateToLocation(Loc);
6646
6647 uint32_t SrcLocStrSize;
6648 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6649 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6650 Value *ThreadId = getOrCreateThreadID(Ident);
6651 Value *Args[] = {ThreadId, Addr, Allocator};
6652 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6653 return Builder.CreateCall(Fn, Args, Name);
6654}
6655
6656CallInst *OpenMPIRBuilder::createOMPInteropInit(
6657 const LocationDescription &Loc, Value *InteropVar,
6658 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6659 Value *DependenceAddress, bool HaveNowaitClause) {
6660 IRBuilder<>::InsertPointGuard IPG(Builder);
6661 updateToLocation(Loc);
6662
6663 uint32_t SrcLocStrSize;
6664 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6665 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6666 Value *ThreadId = getOrCreateThreadID(Ident);
6667 if (Device == nullptr)
6668 Device = ConstantInt::get(Int32, -1);
6669 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6670 if (NumDependences == nullptr) {
6671 NumDependences = ConstantInt::get(Int32, 0);
6672 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6673 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6674 }
6675 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6676 Value *Args[] = {
6677 Ident, ThreadId, InteropVar, InteropTypeVal,
6678 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6679
6680 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6681
6682 return Builder.CreateCall(Fn, Args);
6683}
6684
6685CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6686 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6687 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6688 IRBuilder<>::InsertPointGuard IPG(Builder);
6689 updateToLocation(Loc);
6690
6691 uint32_t SrcLocStrSize;
6692 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6693 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6694 Value *ThreadId = getOrCreateThreadID(Ident);
6695 if (Device == nullptr)
6696 Device = ConstantInt::get(Int32, -1);
6697 if (NumDependences == nullptr) {
6698 NumDependences = ConstantInt::get(Int32, 0);
6699 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6700 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6701 }
6702 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6703 Value *Args[] = {
6704 Ident, ThreadId, InteropVar, Device,
6705 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6706
6707 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6708
6709 return Builder.CreateCall(Fn, Args);
6710}
6711
6712CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6713 Value *InteropVar, Value *Device,
6714 Value *NumDependences,
6715 Value *DependenceAddress,
6716 bool HaveNowaitClause) {
6717 IRBuilder<>::InsertPointGuard IPG(Builder);
6718 updateToLocation(Loc);
6719 uint32_t SrcLocStrSize;
6720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6722 Value *ThreadId = getOrCreateThreadID(Ident);
6723 if (Device == nullptr)
6724 Device = ConstantInt::get(Int32, -1);
6725 if (NumDependences == nullptr) {
6726 NumDependences = ConstantInt::get(Int32, 0);
6727 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6728 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6729 }
6730 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6731 Value *Args[] = {
6732 Ident, ThreadId, InteropVar, Device,
6733 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6734
6735 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6736
6737 return Builder.CreateCall(Fn, Args);
6738}
6739
6740CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6741 const LocationDescription &Loc, llvm::Value *Pointer,
6742 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6743 IRBuilder<>::InsertPointGuard IPG(Builder);
6744 updateToLocation(Loc);
6745
6746 uint32_t SrcLocStrSize;
6747 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6748 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6749 Value *ThreadId = getOrCreateThreadID(Ident);
6750 Constant *ThreadPrivateCache =
6751 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6752 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6753
6754 Function *Fn =
6755 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6756
6757 return Builder.CreateCall(Fn, Args);
6758}
6759
6760OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6761 const LocationDescription &Loc,
6762 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6763 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6764 "expected num_threads and num_teams to be specified");
6765
6766 if (!updateToLocation(Loc))
6767 return Loc.IP;
6768
6769 uint32_t SrcLocStrSize;
6770 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6771 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6772 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6773 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6774 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6775 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6776 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6777
6778 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6779 Function *Kernel = DebugKernelWrapper;
6780
6781 // We need to strip the debug prefix to get the correct kernel name.
6782 StringRef KernelName = Kernel->getName();
6783 const std::string DebugPrefix = "_debug__";
6784 if (KernelName.ends_with(DebugPrefix)) {
6785 KernelName = KernelName.drop_back(DebugPrefix.length());
6786 Kernel = M.getFunction(KernelName);
6787 assert(Kernel && "Expected the real kernel to exist");
6788 }
6789
6790 // Manifest the launch configuration in the metadata matching the kernel
6791 // environment.
6792 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6793 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6794
6795 // If MaxThreads is not set, select the maximum between the default workgroup
6796 // size and the MinThreads value.
6797 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6798 if (MaxThreadsVal < 0)
6799 MaxThreadsVal = std::max(
6800 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6801
6802 if (MaxThreadsVal > 0)
6803 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6804
6805 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6806 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6807 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6808 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6809 Constant *ReductionDataSize =
6810 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6811 Constant *ReductionBufferLength =
6812 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6813
6814 Function *Fn = getOrCreateRuntimeFunctionPtr(
6815 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6816 const DataLayout &DL = Fn->getDataLayout();
6817
6818 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6819 Constant *DynamicEnvironmentInitializer =
6820 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6821 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6822 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6823 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6824 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6825 DL.getDefaultGlobalsAddressSpace());
6826 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6827
6828 Constant *DynamicEnvironment =
6829 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6830 ? DynamicEnvironmentGV
6831 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6832 DynamicEnvironmentPtr);
6833
6834 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6835 ConfigurationEnvironment, {
6836 UseGenericStateMachineVal,
6837 MayUseNestedParallelismVal,
6838 IsSPMDVal,
6839 MinThreads,
6840 MaxThreads,
6841 MinTeams,
6842 MaxTeams,
6843 ReductionDataSize,
6844 ReductionBufferLength,
6845 });
6846 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6847 KernelEnvironment, {
6848 ConfigurationEnvironmentInitializer,
6849 Ident,
6850 DynamicEnvironment,
6851 });
6852 std::string KernelEnvironmentName =
6853 (KernelName + "_kernel_environment").str();
6854 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6855 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6856 KernelEnvironmentInitializer, KernelEnvironmentName,
6857 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6858 DL.getDefaultGlobalsAddressSpace());
6859 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6860
6861 Constant *KernelEnvironment =
6862 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6863 ? KernelEnvironmentGV
6864 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6865 KernelEnvironmentPtr);
6866 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6867 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6868 KernelLaunchEnvironment =
6869 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6870 ? KernelLaunchEnvironment
6871 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6872 KernelLaunchEnvParamTy);
6873 CallInst *ThreadKind =
6874 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6875
6876 Value *ExecUserCode = Builder.CreateICmpEQ(
6877 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6878 "exec_user_code");
6879
6880 // ThreadKind = __kmpc_target_init(...)
6881 // if (ThreadKind == -1)
6882 // user_code
6883 // else
6884 // return;
6885
6886 auto *UI = Builder.CreateUnreachable();
6887 BasicBlock *CheckBB = UI->getParent();
6888 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6889
6890 BasicBlock *WorkerExitBB = BasicBlock::Create(
6891 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6892 Builder.SetInsertPoint(WorkerExitBB);
6893 Builder.CreateRetVoid();
6894
6895 auto *CheckBBTI = CheckBB->getTerminator();
6896 Builder.SetInsertPoint(CheckBBTI);
6897 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6898
6899 CheckBBTI->eraseFromParent();
6900 UI->eraseFromParent();
6901
6902 // Continue in the "user_code" block, see diagram above and in
6903 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6904 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6905}
6906
6907void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6908 int32_t TeamsReductionDataSize,
6909 int32_t TeamsReductionBufferLength) {
6910 if (!updateToLocation(Loc))
6911 return;
6912
6913 Function *Fn = getOrCreateRuntimeFunctionPtr(
6914 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6915
6916 Builder.CreateCall(Fn, {});
6917
6918 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6919 return;
6920
6921 Function *Kernel = Builder.GetInsertBlock()->getParent();
6922 // We need to strip the debug prefix to get the correct kernel name.
6923 StringRef KernelName = Kernel->getName();
6924 const std::string DebugPrefix = "_debug__";
6925 if (KernelName.ends_with(DebugPrefix))
6926 KernelName = KernelName.drop_back(DebugPrefix.length());
6927 auto *KernelEnvironmentGV =
6928 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6929 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6930 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6931 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6932 KernelEnvironmentInitializer,
6933 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6934 NewInitializer = ConstantFoldInsertValueInstruction(
6935 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6936 {0, 8});
6937 KernelEnvironmentGV->setInitializer(NewInitializer);
6938}
6939
6940static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6941 bool Min) {
6942 if (Kernel.hasFnAttribute(Name)) {
6943 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6944 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6945 }
6946 Kernel.addFnAttr(Name, llvm::utostr(Value));
6947}
6948
6949std::pair<int32_t, int32_t>
6950OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6951 int32_t ThreadLimit =
6952 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6953
6954 if (T.isAMDGPU()) {
6955 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6956 if (!Attr.isValid() || !Attr.isStringAttribute())
6957 return {0, ThreadLimit};
6958 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
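// The attribute value has the form "<min>,<max>", e.g. "1,256".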
6959 int32_t LB, UB;
6960 if (!llvm::to_integer(UBStr, UB, 10))
6961 return {0, ThreadLimit};
6962 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6963 if (!llvm::to_integer(LBStr, LB, 10))
6964 return {0, UB};
6965 return {LB, UB};
6966 }
6967
6968 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6969 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6970 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6971 }
6972 return {0, ThreadLimit};
6973}
6974
6975void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6976 Function &Kernel, int32_t LB,
6977 int32_t UB) {
6978 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6979
6980 if (T.isAMDGPU()) {
6981 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6982 llvm::utostr(LB) + "," + llvm::utostr(UB));
6983 return;
6984 }
6985
6986 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6987}
6988
6989std::pair<int32_t, int32_t>
6990OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6991 // TODO: Read from backend annotations if available.
6992 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6993}
6994
6995void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6996 int32_t LB, int32_t UB) {
6997 if (T.isNVPTX())
6998 if (UB > 0)
6999 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7000 if (T.isAMDGPU())
7001 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7002
7003 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7004}
7005
7006void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7007 Function *OutlinedFn) {
7008 if (Config.isTargetDevice()) {
7009 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7010 // TODO: Determine if DSO local can be set to true.
7011 OutlinedFn->setDSOLocal(false);
7012 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7013 if (T.isAMDGCN())
7014 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7015 else if (T.isNVPTX())
7016 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7017 else if (T.isSPIRV())
7018 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7019 }
7020}
7021
7022Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7023 StringRef EntryFnIDName) {
7024 if (Config.isTargetDevice()) {
7025 assert(OutlinedFn && "The outlined function must exist if embedded");
7026 return OutlinedFn;
7027 }
7028
7029 return new GlobalVariable(
7030 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7031 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7032}
7033
7034Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7035 StringRef EntryFnName) {
7036 if (OutlinedFn)
7037 return OutlinedFn;
7038
7039 assert(!M.getGlobalVariable(EntryFnName, true) &&
7040 "Named kernel already exists?");
7041 return new GlobalVariable(
7042 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7043 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7044}
7045
7046Error OpenMPIRBuilder::emitTargetRegionFunction(
7047 TargetRegionEntryInfo &EntryInfo,
7048 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7049 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7050
7051 SmallString<64> EntryFnName;
7052 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7053
7054 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7055 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7056 if (!CBResult)
7057 return CBResult.takeError();
7058 OutlinedFn = *CBResult;
7059 } else {
7060 OutlinedFn = nullptr;
7061 }
7062
7063 // If this target outline function is not an offload entry, we don't need to
7064 // register it. This may be the case for a false if clause, or if there are
7065 // no OpenMP targets.
7066 if (!IsOffloadEntry)
7067 return Error::success();
7068
7069 std::string EntryFnIDName =
7070 Config.isTargetDevice()
7071 ? std::string(EntryFnName)
7072 : createPlatformSpecificName({EntryFnName, "region_id"});
7073
7074 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7075 EntryFnName, EntryFnIDName);
7076 return Error::success();
7077}
7078
7079Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7080 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7081 StringRef EntryFnName, StringRef EntryFnIDName) {
7082 if (OutlinedFn)
7083 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7084 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7085 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7086 OffloadInfoManager.registerTargetRegionEntryInfo(
7087 EntryInfo, EntryAddr, OutlinedFnID,
7088 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7089 return OutlinedFnID;
7090}
7091
7092OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7093 const LocationDescription &Loc, InsertPointTy AllocaIP,
7094 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7095 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7096 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7097 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7098 BodyGenTy BodyGenType)>
7099 BodyGenCB,
7100 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7101 if (!updateToLocation(Loc))
7102 return InsertPointTy();
7103
7104 Builder.restoreIP(CodeGenIP);
7105 // Disable TargetData CodeGen on Device pass.
7106 if (Config.IsTargetDevice.value_or(false)) {
7107 if (BodyGenCB) {
7108 InsertPointOrErrorTy AfterIP =
7109 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7110 if (!AfterIP)
7111 return AfterIP.takeError();
7112 Builder.restoreIP(*AfterIP);
7113 }
7114 return Builder.saveIP();
7115 }
7116
7117 bool IsStandAlone = !BodyGenCB;
7118 MapInfosTy *MapInfo;
7119 // Generate the code for the opening of the data environment. Capture all the
7120 // arguments of the runtime call by reference because they are used in the
7121 // closing of the region.
7122 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7123 InsertPointTy CodeGenIP) -> Error {
7124 MapInfo = &GenMapInfoCB(Builder.saveIP());
7125 if (Error Err = emitOffloadingArrays(
7126 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7127 /*IsNonContiguous=*/true, DeviceAddrCB))
7128 return Err;
7129
7130 TargetDataRTArgs RTArgs;
7131 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7132
7133 // Emit the number of elements in the offloading arrays.
7134 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7135
7136 // Source location for the ident struct
7137 if (!SrcLocInfo) {
7138 uint32_t SrcLocStrSize;
7139 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7140 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7141 }
7142
7143 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7144 SrcLocInfo, DeviceID,
7145 PointerNum, RTArgs.BasePointersArray,
7146 RTArgs.PointersArray, RTArgs.SizesArray,
7147 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7148 RTArgs.MappersArray};
7149
7150 if (IsStandAlone) {
7151 assert(MapperFunc && "MapperFunc missing for standalone target data");
7152
7153 auto TaskBodyCB = [&](Value *, Value *,
7154 IRBuilderBase::InsertPoint) -> Error {
7155 if (Info.HasNoWait) {
7156 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7157 llvm::Constant::getNullValue(VoidPtr),
7158 llvm::Constant::getNullValue(Int32),
7159 llvm::Constant::getNullValue(VoidPtr)});
7160 }
7161
7162 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7163 OffloadingArgs);
7164
7165 if (Info.HasNoWait) {
7166 BasicBlock *OffloadContBlock =
7167 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7168 Function *CurFn = Builder.GetInsertBlock()->getParent();
7169 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7170 Builder.restoreIP(Builder.saveIP());
7171 }
7172 return Error::success();
7173 };
7174
7175 bool RequiresOuterTargetTask = Info.HasNoWait;
7176 if (!RequiresOuterTargetTask)
7177 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7178 /*TargetTaskAllocaIP=*/{}));
7179 else
7180 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7181 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7182 } else {
7183 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7184 omp::OMPRTL___tgt_target_data_begin_mapper);
7185
7186 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7187
7188 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7189 if (isa<AllocaInst>(DeviceMap.second.second)) {
7190 auto *LI =
7191 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7192 Builder.CreateStore(LI, DeviceMap.second.second);
7193 }
7194 }
7195
7196 // If device pointer privatization is required, emit the body of the
7197 // region here. It will have to be duplicated: with and without
7198 // privatization.
7199 InsertPointOrErrorTy AfterIP =
7200 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7201 if (!AfterIP)
7202 return AfterIP.takeError();
7203 Builder.restoreIP(*AfterIP);
7204 }
7205 return Error::success();
7206 };
7207
7208 // If we need device pointer privatization, we need to emit the body of the
7209 // region with no privatization in the 'else' branch of the conditional.
7210 // Otherwise, we don't have to do anything.
7211 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7212 InsertPointTy CodeGenIP) -> Error {
7213 InsertPointOrErrorTy AfterIP =
7214 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7215 if (!AfterIP)
7216 return AfterIP.takeError();
7217 Builder.restoreIP(*AfterIP);
7218 return Error::success();
7219 };
7220
7221 // Generate code for the closing of the data region.
7222 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7223 TargetDataRTArgs RTArgs;
7224 Info.EmitDebug = !MapInfo->Names.empty();
7225 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7226
7227 // Emit the number of elements in the offloading arrays.
7228 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7229
7230 // Source location for the ident struct
7231 if (!SrcLocInfo) {
7232 uint32_t SrcLocStrSize;
7233 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7234 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7235 }
7236
7237 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7238 PointerNum, RTArgs.BasePointersArray,
7239 RTArgs.PointersArray, RTArgs.SizesArray,
7240 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7241 RTArgs.MappersArray};
7242 Function *EndMapperFunc =
7243 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7244
7245 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7246 return Error::success();
7247 };
7248
7249 // We don't have to do anything to close the region if the if clause evaluates
7250 // to false.
7251 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7252 return Error::success();
7253 };
7254
7255 Error Err = [&]() -> Error {
7256 if (BodyGenCB) {
7257 Error Err = [&]() {
7258 if (IfCond)
7259 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7260 return BeginThenGen(AllocaIP, Builder.saveIP());
7261 }();
7262
7263 if (Err)
7264 return Err;
7265
7266 // If we don't require privatization of device pointers, we emit the body
7267 // in between the runtime calls. This avoids duplicating the body code.
7268 InsertPointOrErrorTy AfterIP =
7269 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7270 if (!AfterIP)
7271 return AfterIP.takeError();
7272 restoreIPandDebugLoc(Builder, *AfterIP);
7273
7274 if (IfCond)
7275 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7276 return EndThenGen(AllocaIP, Builder.saveIP());
7277 }
7278 if (IfCond)
7279 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7280 return BeginThenGen(AllocaIP, Builder.saveIP());
7281 }();
7282
7283 if (Err)
7284 return Err;
7285
7286 return Builder.saveIP();
7287}
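// Illustrative host-side call sequence (a sketch, not verbatim output) for a
// data region with a body and no if clause:
//
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %device_id,
//       i32 %num_ptrs, ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes,
//       ptr %mapnames, ptr %mappers)
//   ; ... user body (BodyGenCB with NoPriv) ...
//   call void @__tgt_target_data_end_mapper(...same argument shape...)
//
// The %-names are placeholders for the values computed above.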
7288
7289 FunctionCallee
7290 OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7291 bool IsGPUDistribute) {
7292 assert((IVSize == 32 || IVSize == 64) &&
7293 "IV size is not compatible with the omp runtime");
7294 RuntimeFunction Name;
7295 if (IsGPUDistribute)
7296 Name = IVSize == 32
7297 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7298 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7299 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7300 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7301 else
7302 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7303 : omp::OMPRTL___kmpc_for_static_init_4u)
7304 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7305 : omp::OMPRTL___kmpc_for_static_init_8u);
7306
7307 return getOrCreateRuntimeFunction(M, Name);
7308}
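// For example: IVSize == 32, IVSigned == true, IsGPUDistribute == false
// selects __kmpc_for_static_init_4, while IVSize == 64, IVSigned == false,
// IsGPUDistribute == true selects __kmpc_distribute_static_init_8u. The same
// 4/4u/8/8u suffix scheme is used by the dispatch helpers below.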
7309
7310FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7311 bool IVSigned) {
7312 assert((IVSize == 32 || IVSize == 64) &&
7313 "IV size is not compatible with the omp runtime");
7314 RuntimeFunction Name = IVSize == 32
7315 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7316 : omp::OMPRTL___kmpc_dispatch_init_4u)
7317 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7318 : omp::OMPRTL___kmpc_dispatch_init_8u);
7319
7320 return getOrCreateRuntimeFunction(M, Name);
7321}
7322
7323FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7324 bool IVSigned) {
7325 assert((IVSize == 32 || IVSize == 64) &&
7326 "IV size is not compatible with the omp runtime");
7327 RuntimeFunction Name = IVSize == 32
7328 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7329 : omp::OMPRTL___kmpc_dispatch_next_4u)
7330 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7331 : omp::OMPRTL___kmpc_dispatch_next_8u);
7332
7333 return getOrCreateRuntimeFunction(M, Name);
7334}
7335
7336FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7337 bool IVSigned) {
7338 assert((IVSize == 32 || IVSize == 64) &&
7339 "IV size is not compatible with the omp runtime");
7340 RuntimeFunction Name = IVSize == 32
7341 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7342 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7343 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7344 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7345
7346 return getOrCreateRuntimeFunction(M, Name);
7347}
7348
7349FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7350 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7351}
7352
7353 static void FixupDebugInfoForOutlinedFunction(
7354 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7355 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7356
7357 DISubprogram *NewSP = Func->getSubprogram();
7358 if (!NewSP)
7359 return;
7360
7361 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7362
7363 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7364 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7365 // Only use the cached variable if the arg number matches. This is important
7366 // so that DIVariables created for privatized variables are not discarded.
7367 if (NewVar && (arg == NewVar->getArg()))
7368 return NewVar;
7369
7370 NewVar = llvm::DILocalVariable::get(
7371 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7372 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7373 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7374 return NewVar;
7375 };
7376
7377 auto UpdateDebugRecord = [&](auto *DR) {
7378 DILocalVariable *OldVar = DR->getVariable();
7379 unsigned ArgNo = 0;
7380 for (auto Loc : DR->location_ops()) {
7381 auto Iter = ValueReplacementMap.find(Loc);
7382 if (Iter != ValueReplacementMap.end()) {
7383 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7384 ArgNo = std::get<1>(Iter->second) + 1;
7385 }
7386 }
7387 if (ArgNo != 0)
7388 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7389 };
7390
7391 // The location and scope of variable intrinsics and records still point to
7392 // the parent function of the target region. Update them.
7393 for (Instruction &I : instructions(Func)) {
7395 "Unexpected debug intrinsic");
7396 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7397 UpdateDebugRecord(&DVR);
7398 }
7399 // An extra argument is passed to the device. Create the debug data for it.
7400 if (OMPBuilder.Config.isTargetDevice()) {
7401 DICompileUnit *CU = NewSP->getUnit();
7402 Module *M = Func->getParent();
7403 DIBuilder DB(*M, true, CU);
7404 DIType *VoidPtrTy =
7405 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7406 DILocalVariable *Var = DB.createParameterVariable(
7407 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7408 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7409 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7410 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7411 &(*Func->begin()));
7412 }
7413}
7414
7415 static Value *removeASCastIfPresent(Value *V) {
7416 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7417 return cast<Operator>(V)->getOperand(0);
7418 return V;
7419}
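// Illustrative example: for
//
//   %p = addrspacecast ptr addrspace(1) @G to ptr
//
// this helper returns @G; any value that is not an address-space cast is
// returned unchanged.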
7420
7421 static Expected<Function *> createOutlinedFunction(
7422 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7423 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7424 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7425 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7426 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7427 SmallVector<Type *> ParameterTypes;
7428 if (OMPBuilder.Config.isTargetDevice()) {
7429 // Add the "implicit" runtime argument we use to provide launch specific
7430 // information for target devices.
7431 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7432 ParameterTypes.push_back(Int8PtrTy);
7433
7434 // All parameters to target devices are passed as pointers
7435 // or i64. This assumes 64-bit address spaces/pointers.
7436 for (auto &Arg : Inputs)
7437 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7438 ? Arg->getType()
7439 : Type::getInt64Ty(Builder.getContext()));
7440 } else {
7441 for (auto &Arg : Inputs)
7442 ParameterTypes.push_back(Arg->getType());
7443 }
7444
7445 auto BB = Builder.GetInsertBlock();
7446 auto M = BB->getModule();
7447 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7448 /*isVarArg*/ false);
7449 auto Func =
7450 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7451
7452 // Forward target-cpu and target-features function attributes from the
7453 // original function to the new outlined function.
7454 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7455
7456 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7457 if (TargetCpuAttr.isStringAttribute())
7458 Func->addFnAttr(TargetCpuAttr);
7459
7460 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7461 if (TargetFeaturesAttr.isStringAttribute())
7462 Func->addFnAttr(TargetFeaturesAttr);
7463
7464 if (OMPBuilder.Config.isTargetDevice()) {
7465 Value *ExecMode =
7466 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7467 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7468 }
7469
7470 // Save insert point.
7471 IRBuilder<>::InsertPointGuard IPG(Builder);
7472 // We will generate the entries in the outlined function but the debug
7473 // location may still be pointing to the parent function. Reset it now.
7474 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7475
7476 // Generate the region into the function.
7477 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7478 Builder.SetInsertPoint(EntryBB);
7479
7480 // Insert target init call in the device compilation pass.
7481 if (OMPBuilder.Config.isTargetDevice())
7482 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7483
7484 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7485
7486 // As we embed the user code in the middle of our target region after we
7487 // generate entry code, we must move what allocas we can into the entry
7488 // block to avoid breaking possible optimisations for the device.
7489 if (OMPBuilder.Config.isTargetDevice())
7490 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7491
7492 // Insert target deinit call in the device compilation pass.
7493 BasicBlock *OutlinedBodyBB =
7494 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7495 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7496 Builder.saveIP(),
7497 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7498 if (!AfterIP)
7499 return AfterIP.takeError();
7500 Builder.restoreIP(*AfterIP);
7501 if (OMPBuilder.Config.isTargetDevice())
7502 OMPBuilder.createTargetDeinit(Builder);
7503
7504 // Insert return instruction.
7505 Builder.CreateRetVoid();
7506
7507 // New Alloca IP at entry point of created device function.
7508 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7509 auto AllocaIP = Builder.saveIP();
7510
7511 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7512
7513 // Skip the artificial dyn_ptr on the device.
7514 const auto &ArgRange =
7515 OMPBuilder.Config.isTargetDevice()
7516 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7517 : Func->args();
7518
7519 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7520
7521 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7522 // Things like GEP's can come in the form of Constants. Constants and
7523 // ConstantExpr's do not have access to the knowledge of what they're
7524 // contained in, so we must dig a little to find an instruction so we
7525 // can tell if they're used inside of the function we're outlining. We
7526 // also replace the original constant expression with a new instruction
7527 // equivalent; an instruction as it allows easy modification in the
7528 // following loop, as we can now know the constant (instruction) is
7529 // owned by our target function and replaceUsesOfWith can now be invoked
7530 // on it (cannot do this with constants it seems). A brand new one also
7531 // allows us to be cautious as it is perhaps possible the old expression
7532 // was used inside of the function but exists and is used externally
7533 // (unlikely by the nature of a Constant, but still).
7534 // NOTE: We cannot remove dead constants that have been rewritten to
7535 // instructions at this stage, we run the risk of breaking later lowering
7536 // by doing so as we could still be in the process of lowering the module
7537 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7538 // constants we have created rewritten versions of.
7539 if (auto *Const = dyn_cast<Constant>(Input))
7540 convertUsersOfConstantsToInstructions(Const, Func, false);
7541
7542 // Collect users before iterating over them to avoid invalidating the
7543 // iteration in case a user uses Input more than once (e.g. a call
7544 // instruction).
7545 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7546 // Collect all the instructions
7547 for (User *User : make_early_inc_range(Users))
7548 if (auto *Instr = dyn_cast<Instruction>(User))
7549 if (Instr->getFunction() == Func)
7550 Instr->replaceUsesOfWith(Input, InputCopy);
7551 };
7552
7553 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7554
7555 // Rewrite uses of input values to parameters.
7556 for (auto InArg : zip(Inputs, ArgRange)) {
7557 Value *Input = std::get<0>(InArg);
7558 Argument &Arg = std::get<1>(InArg);
7559 Value *InputCopy = nullptr;
7560
7561 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7562 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7563 if (!AfterIP)
7564 return AfterIP.takeError();
7565 Builder.restoreIP(*AfterIP);
7566 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7567
7568 // In certain cases a Global may be set up for replacement; however, this
7569 // Global may be used in multiple arguments to the kernel, just segmented
7570 // apart. For example, if we have a global array that is sectioned into
7571 // multiple mappings (technically not legal in OpenMP, but there is a case
7572 // in Fortran for Common Blocks where this is necessary), we will end up
7573 // with GEP's into this array inside the kernel that refer to the Global
7574 // but are technically separate arguments to the kernel for all intents and
7575 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7576 // index, it will fold into a reference to the Global; if we then encounter
7577 // this folded GEP during replacement, all of the references to the
7578 // Global in the kernel will be replaced with the argument we have generated
7579 // that corresponds to it, including any other GEP's that refer to the
7580 // Global and that may be other arguments. This will invalidate all of the
7581 // other preceding mapped arguments that refer to the same global but are
7582 // separate segments. To prevent this, we defer global processing until all
7583 // other processing has been performed.
7584 if (llvm::isa<llvm::GlobalValue>(
7585 removeASCastIfPresent(Input))) {
7586 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7587 continue;
7588 }
7589
7590 if (isa<ConstantData>(Input))
7591 continue;
7592
7593 ReplaceValue(Input, InputCopy, Func);
7594 }
7595
7596 // Replace all of our deferred Input values, currently just Globals.
7597 for (auto Deferred : DeferredReplacement)
7598 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7599
7600 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7601 ValueReplacementMap);
7602 return Func;
7603}
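// Illustrative device-side result (a sketch; the kernel name and argument
// list are hypothetical): for a region capturing a pointer %a and a scalar n,
// the function created above looks roughly like
//
//   define internal void @__omp_offloading_xy_main_l10(ptr %dyn_ptr, ptr %a,
//                                                      i64 %n)
//
// where %dyn_ptr is the implicit launch argument added on the device path and
// non-pointer inputs are passed as i64, as set up in ParameterTypes above.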
7604/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7605/// of pointers containing shared data between the parent task and the created
7606/// task.
7607static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7608 IRBuilderBase &Builder,
7609 Value *TaskWithPrivates,
7610 Type *TaskWithPrivatesTy) {
7611
7612 Type *TaskTy = OMPIRBuilder.Task;
7613 LLVMContext &Ctx = Builder.getContext();
7614 Value *TaskT =
7615 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7616 Value *Shareds = TaskT;
7617 // TaskWithPrivatesTy can be one of the following
7618 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7619 // %struct.privates }
7620 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7621 //
7622 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7623 // its first member has to be the task descriptor. TaskTy is the type of the
7624 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7625 // first member of TaskT, gives us the pointer to shared data.
7626 if (TaskWithPrivatesTy != TaskTy)
7627 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7628 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7629}
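// Illustrative IR sketch of the access emitted above (names are placeholders):
//
//   %task_t = getelementptr %struct.task_with_privates, ptr %task, i32 0, i32 0
//   %shareds.addr = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task_t,
//                                 i32 0, i32 0
//   %shareds = load ptr, ptr %shareds.addr
//
// In the unwrapped case (TaskWithPrivatesTy == TaskTy) only the first GEP is
// emitted and the load reads the first field of the task descriptor directly.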
7630/// Create an entry point for a target task with the following.
7631/// It'll have the following signature
7632/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7633/// This function is called from emitTargetTask once the
7634/// code to launch the target kernel has been outlined already.
7635/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7636/// into the task structure so that the deferred target task can access this
7637/// data even after the stack frame of the generating task has been rolled
7638/// back. Offloading arrays contain base pointers, pointers, sizes etc
7639/// of the data that the target kernel will access. These in effect are the
7640/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7641 static Function *emitTargetTaskProxyFunction(
7642 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7643 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7644 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7645
7646 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7647 // This is because PrivatesTy is the type of the structure in which
7648 // we pass the offloading arrays to the deferred target task.
7649 assert((!NumOffloadingArrays || PrivatesTy) &&
7650 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7651 "to privatize");
7652
7653 Module &M = OMPBuilder.M;
7654 // KernelLaunchFunction is the target launch function, i.e.
7655 // the function that sets up kernel arguments and calls
7656 // __tgt_target_kernel to launch the kernel on the device.
7657 //
7658 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7659
7660 // StaleCI is the CallInst which is the call to the outlined
7661 // target kernel launch function. If there are local live-in values
7662 // that the outlined function uses then these are aggregated into a structure
7663 // which is passed as the second argument. If there are no local live-in
7664 // values or if all values used by the outlined kernel are global variables,
7665 // then there's only one argument, the threadID. So, StaleCI can be
7666 //
7667 // %structArg = alloca { ptr, ptr }, align 8
7668 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7669 // store ptr %20, ptr %gep_, align 8
7670 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7671 // store ptr %21, ptr %gep_8, align 8
7672 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7673 //
7674 // OR
7675 //
7676 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7677 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7678 StaleCI->getIterator());
7679
7680 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7681
7682 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7683 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7684 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7685
7686 auto ProxyFnTy =
7687 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7688 /* isVarArg */ false);
7689 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7690 ".omp_target_task_proxy_func",
7691 Builder.GetInsertBlock()->getModule());
7692 Value *ThreadId = ProxyFn->getArg(0);
7693 Value *TaskWithPrivates = ProxyFn->getArg(1);
7694 ThreadId->setName("thread.id");
7695 TaskWithPrivates->setName("task");
7696
7697 bool HasShareds = SharedArgsOperandNo > 0;
7698 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7699 BasicBlock *EntryBB =
7700 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7701 Builder.SetInsertPoint(EntryBB);
7702
7703 SmallVector<Value *> KernelLaunchArgs;
7704 KernelLaunchArgs.reserve(StaleCI->arg_size());
7705 KernelLaunchArgs.push_back(ThreadId);
7706
7707 if (HasOffloadingArrays) {
7708 assert(TaskTy != TaskWithPrivatesTy &&
7709 "If there are offloading arrays to pass to the target"
7710 "TaskTy cannot be the same as TaskWithPrivatesTy");
7711 (void)TaskTy;
7712 Value *Privates =
7713 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7714 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7715 KernelLaunchArgs.push_back(
7716 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7717 }
7718
7719 if (HasShareds) {
7720 auto *ArgStructAlloca =
7721 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7722 assert(ArgStructAlloca &&
7723 "Unable to find the alloca instruction corresponding to arguments "
7724 "for extracted function");
7725 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7726
7727 AllocaInst *NewArgStructAlloca =
7728 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7729
7730 Value *SharedsSize =
7731 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7732
7733 LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
7734 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7735
7736 Builder.CreateMemCpy(
7737 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7738 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7739 KernelLaunchArgs.push_back(NewArgStructAlloca);
7740 }
7741 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7742 Builder.CreateRetVoid();
7743 return ProxyFn;
7744}
7745 static Type *getOffloadingArrayType(Value *V) {
7746
7747 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7748 return GEP->getSourceElementType();
7749 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7750 return Alloca->getAllocatedType();
7751
7752 llvm_unreachable("Unhandled Instruction type");
7753 return nullptr;
7754}
7755// This function returns a struct that has at most two members.
7756// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7757// descriptor. The second member, if needed, is a struct containing arrays
7758// that need to be passed to the offloaded target kernel. For example,
7759// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7760// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7761// respectively, then the types created by this function are
7762//
7763// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7764// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7765// %struct.privates }
7766// %struct.task_with_privates is returned by this function.
7767// If there aren't any offloading arrays to pass to the target kernel,
7768// %struct.kmp_task_ompbuilder_t is returned.
7769static StructType *
7770createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7771 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7772
7773 if (OffloadingArraysToPrivatize.empty())
7774 return OMPIRBuilder.Task;
7775
7776 SmallVector<Type *, 4> StructFieldTypes;
7777 for (Value *V : OffloadingArraysToPrivatize) {
7778 assert(V->getType()->isPointerTy() &&
7779 "Expected pointer to array to privatize. Got a non-pointer value "
7780 "instead");
7781 Type *ArrayTy = getOffloadingArrayType(V);
7782 assert(ArrayTy && "ArrayType cannot be nullptr");
7783 StructFieldTypes.push_back(ArrayTy);
7784 }
7785 StructType *PrivatesStructTy =
7786 StructType::create(StructFieldTypes, "struct.privates");
7787 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7788 "struct.task_with_privates");
7789}
7790 static Error emitTargetOutlinedFunction(
7791 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7792 TargetRegionEntryInfo &EntryInfo,
7793 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7794 Function *&OutlinedFn, Constant *&OutlinedFnID,
7795 SmallVectorImpl<Value *> &Inputs,
7796 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7797 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7798
7799 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7800 [&](StringRef EntryFnName) {
7801 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7802 EntryFnName, Inputs, CBFunc,
7803 ArgAccessorFuncCB);
7804 };
7805
7806 return OMPBuilder.emitTargetRegionFunction(
7807 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7808 OutlinedFnID);
7809}
7810
7811OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7812 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7813 OpenMPIRBuilder::InsertPointTy AllocaIP,
7814 const SmallVector<DependData> &Dependencies,
7815 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7816
7817 // The following explains the code-gen scenario for the `target` directive. A
7818 // similar scenario is followed for other device-related directives (e.g.
7819 // `target enter data`), since for those we only need to emit a task
7820 // that encapsulates the proper runtime call.
7821 //
7822 // When we arrive at this function, the target region itself has been
7823 // outlined into the function OutlinedFn.
7824 // So at this point, for
7825 // --------------------------------------------------------------
7826 // void user_code_that_offloads(...) {
7827 // omp target depend(..) map(from:a) map(to:b) private(i)
7828 // do i = 1, 10
7829 // a(i) = b(i) + n
7830 // }
7831 //
7832 // --------------------------------------------------------------
7833 //
7834 // we have
7835 //
7836 // --------------------------------------------------------------
7837 //
7838 // void user_code_that_offloads(...) {
7839 // %.offload_baseptrs = alloca [2 x ptr], align 8
7840 // %.offload_ptrs = alloca [2 x ptr], align 8
7841 // %.offload_mappers = alloca [2 x ptr], align 8
7842 // ;; target region has been outlined and now we need to
7843 // ;; offload to it via a target task.
7844 // }
7845 // void outlined_device_function(ptr a, ptr b, ptr n) {
7846 // n = *n_ptr;
7847 // do i = 1, 10
7848 // a(i) = b(i) + n
7849 // }
7850 //
7851 // We have to now do the following
7852 // (i) Make an offloading call to outlined_device_function using the OpenMP
7853 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7854 // emitted by emitKernelLaunch
7855 // (ii) Create a task entry point function that calls kernel_launch_function
7856 // and is the entry point for the target task. See
7857 // '@.omp_target_task_proxy_func in the pseudocode below.
7858 // (iii) Create a task with the task entry point created in (ii)
7859 //
7860 // That is we create the following
7861 // struct task_with_privates {
7862 // struct kmp_task_ompbuilder_t task_struct;
7863 // struct privates {
7864 // [2 x ptr] ; baseptrs
7865 // [2 x ptr] ; ptrs
7866 // [2 x i64] ; sizes
7867 // }
7868 // }
7869 // void user_code_that_offloads(...) {
7870 // %.offload_baseptrs = alloca [2 x ptr], align 8
7871 // %.offload_ptrs = alloca [2 x ptr], align 8
7872 // %.offload_sizes = alloca [2 x i64], align 8
7873 //
7874 // %structArg = alloca { ptr, ptr, ptr }, align 8
7875 // %strucArg[0] = a
7876 // %strucArg[1] = b
7877 // %strucArg[2] = &n
7878 //
7879 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7880 // sizeof(kmp_task_ompbuilder_t),
7881 // sizeof(structArg),
7882 // @.omp_target_task_proxy_func,
7883 // ...)
7884 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7885 // sizeof(structArg))
7886 // memcpy(target_task_with_privates->privates->baseptrs,
7887 // offload_baseptrs, sizeof(offload_baseptrs)
7888 // memcpy(target_task_with_privates->privates->ptrs,
7889 // offload_ptrs, sizeof(offload_ptrs)
7890 // memcpy(target_task_with_privates->privates->sizes,
7891 // offload_sizes, sizeof(offload_sizes)
7892 // dependencies_array = ...
7893 // ;; if nowait not present
7894 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7895 // call @__kmpc_omp_task_begin_if0(...)
7896 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7897 // %target_task_with_privates)
7898 // call @__kmpc_omp_task_complete_if0(...)
7899 // }
7900 //
7901 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7902 // ptr %task) {
7903 // %structArg = alloca {ptr, ptr, ptr}
7904 // %task_ptr = getelementptr(%task, 0, 0)
7905 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7906 // mempcy(%structArg, %shared_data, sizeof(%structArg))
7907 //
7908 // %offloading_arrays = getelementptr(%task, 0, 1)
7909 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7910 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7911 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7912 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7913 // %offload_sizes, %structArg)
7914 // }
7915 //
7916 // We need the proxy function because the signature of the task entry point
7917 // expected by kmpc_omp_task is always the same and will be different from
7918 // that of the kernel_launch function.
7919 //
7920 // kernel_launch_function is generated by emitKernelLaunch and has the
7921 // always_inline attribute. For this example, it'll look like so:
7922 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7923 // %offload_sizes, %structArg) alwaysinline {
7924 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7925 // ; load aggregated data from %structArg
7926 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7927 // ; offload_sizes
7928 // call i32 @__tgt_target_kernel(...,
7929 // outlined_device_function,
7930 // ptr %kernel_args)
7931 // }
7932 // void outlined_device_function(ptr a, ptr b, ptr n) {
7933 // n = *n_ptr;
7934 // do i = 1, 10
7935 // a(i) = b(i) + n
7936 // }
7937 //
7938 BasicBlock *TargetTaskBodyBB =
7939 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7940 BasicBlock *TargetTaskAllocaBB =
7941 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7942
7943 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7944 TargetTaskAllocaBB->begin());
7945 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7946
7947 OutlineInfo OI;
7948 OI.EntryBB = TargetTaskAllocaBB;
7949 OI.OuterAllocaBB = AllocaIP.getBlock();
7950
7951 // Add the thread ID argument.
7952 SmallVector<Instruction *, 4> ToBeDeleted;
7953 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7954 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7955
7956 // Generate the task body which will subsequently be outlined.
7957 Builder.restoreIP(TargetTaskBodyIP);
7958 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7959 return Err;
7960
7961 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7962 // it is given. These blocks are enumerated by
7963 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7964 // to be outside the region. In other words, OI.ExitBlock is expected to be
7965 // the start of the region after the outlining. We used to set OI.ExitBlock
7966 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7967 // except when the task body is a single basic block. In that case,
7968 // OI.ExitBlock is set to the single task body block and will get left out of
7969 // the outlining process. So, simply create a new empty block to which we
7970 // unconditionally branch from where TaskBodyCB left off.
7971 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7972 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7973 /*IsFinished=*/true);
7974
7975 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7976 bool NeedsTargetTask = HasNoWait && DeviceID;
7977 if (NeedsTargetTask) {
7978 for (auto *V :
7979 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7980 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7981 RTArgs.SizesArray}) {
7982 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
7983 OffloadingArraysToPrivatize.push_back(V);
7984 OI.ExcludeArgsFromAggregate.push_back(V);
7985 }
7986 }
7987 }
7988 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7989 DeviceID, OffloadingArraysToPrivatize](
7990 Function &OutlinedFn) mutable {
7991 assert(OutlinedFn.hasOneUse() &&
7992 "there must be a single user for the outlined function");
7993
7994 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7995
7996 // The first argument of StaleCI is always the thread id.
7997 // The next few arguments are the pointers to offloading arrays
7998 // if any. (see OffloadingArraysToPrivatize)
7999 // Finally, all other local values that are live-in into the outlined region
8000 // end up in a structure whose pointer is passed as the last argument. This
8001 // piece of data is passed in the "shared" field of the task structure. So,
8002 // we know we have to pass shareds to the task if the number of arguments is
8003 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8004 // thread id. Further, for safety, we assert that the number of arguments of
8005 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8006 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8007 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8008 assert((!HasShareds ||
8009 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8010 "Wrong number of arguments for StaleCI when shareds are present");
8011 int SharedArgOperandNo =
8012 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8013
8014 StructType *TaskWithPrivatesTy =
8015 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8016 StructType *PrivatesTy = nullptr;
8017
8018 if (!OffloadingArraysToPrivatize.empty())
8019 PrivatesTy =
8020 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8021
8022 Function *ProxyFn = emitTargetTaskProxyFunction(
8023 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8024 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8025
8026 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8027 << "\n");
8028
8029 Builder.SetInsertPoint(StaleCI);
8030
8031 // Gather the arguments for emitting the runtime call.
8032 uint32_t SrcLocStrSize;
8033 Constant *SrcLocStr =
8034 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8035 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8036
8037 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8038 //
8039 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8040 // the DeviceID to the deferred task, and also because
8041 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8042 Function *TaskAllocFn =
8043 !NeedsTargetTask
8044 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8045 : getOrCreateRuntimeFunctionPtr(
8046 OMPRTL___kmpc_omp_target_task_alloc);
8047
8048 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
8049 // call.
8050 Value *ThreadID = getOrCreateThreadID(Ident);
8051
8052 // Argument - `sizeof_kmp_task_t` (TaskSize)
8053 // TaskSize refers to the size in bytes of the kmp_task_t data structure
8054 // plus any other data to be passed to the target task, if any, which
8055 // is packed into a struct. kmp_task_t and the struct so created are
8056 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8057 Value *TaskSize = Builder.getInt64(
8058 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8059
8060 // Argument - `sizeof_shareds` (SharedsSize)
8061 // SharedsSize refers to the shareds array size in the kmp_task_t data
8062 // structure.
8063 Value *SharedsSize = Builder.getInt64(0);
8064 if (HasShareds) {
8065 auto *ArgStructAlloca =
8066 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8067 assert(ArgStructAlloca &&
8068 "Unable to find the alloca instruction corresponding to arguments "
8069 "for extracted function");
8070 auto *ArgStructType =
8071 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8072 assert(ArgStructType && "Unable to find struct type corresponding to "
8073 "arguments for extracted function");
8074 SharedsSize =
8075 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8076 }
8077
8078 // Argument - `flags`
8079 // Task is tied iff (Flags & 1) == 1.
8080 // Task is untied iff (Flags & 1) == 0.
8081 // Task is final iff (Flags & 2) == 2.
8082 // Task is not final iff (Flags & 2) == 0.
8083 // A target task is not final and is untied.
8084 Value *Flags = Builder.getInt32(0);
8085
8086 // Emit the @__kmpc_omp_task_alloc runtime call
8087 // The runtime call returns a pointer to an area where the task captured
8088 // variables must be copied before the task is run (TaskData)
8089 CallInst *TaskData = nullptr;
8090
8091 SmallVector<llvm::Value *> TaskAllocArgs = {
8092 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8093 /*flags=*/Flags,
8094 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8095 /*task_func=*/ProxyFn};
8096
8097 if (NeedsTargetTask) {
8098 assert(DeviceID && "Expected non-empty device ID.");
8099 TaskAllocArgs.push_back(DeviceID);
8100 }
8101
8102 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8103
8104 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8105 if (HasShareds) {
8106 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8107 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8108 *this, Builder, TaskData, TaskWithPrivatesTy);
8109 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8110 SharedsSize);
8111 }
8112 if (!OffloadingArraysToPrivatize.empty()) {
8113 Value *Privates =
8114 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8115 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8116 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8117 [[maybe_unused]] Type *ArrayType =
8118 getOffloadingArrayType(PtrToPrivatize);
8119 assert(ArrayType && "ArrayType cannot be nullptr");
8120
8121 Type *ElementType = PrivatesTy->getElementType(i);
8122 assert(ElementType == ArrayType &&
8123 "ElementType should match ArrayType");
8124 (void)ArrayType;
8125
8126 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8127 Builder.CreateMemCpy(
8128 Dst, Alignment, PtrToPrivatize, Alignment,
8129 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8130 }
8131 }
8132
8133 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8134
8135 // ---------------------------------------------------------------
8136 // V5.2 13.8 target construct
8137 // If the nowait clause is present, execution of the target task
8138 // may be deferred. If the nowait clause is not present, the target task is
8139 // an included task.
8140 // ---------------------------------------------------------------
8141 // The above means that the lack of a nowait on the target construct
8142 // translates to '#pragma omp task if(0)'
8143 if (!NeedsTargetTask) {
8144 if (DepArray) {
8145 Function *TaskWaitFn =
8146 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8147 Builder.CreateCall(
8148 TaskWaitFn,
8149 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8150 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8151 /*dep_list=*/DepArray,
8152 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8153 /*noalias_dep_list=*/
8154 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8155 }
8156 // Included task.
8157 Function *TaskBeginFn =
8158 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8159 Function *TaskCompleteFn =
8160 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8161 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8162 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8163 CI->setDebugLoc(StaleCI->getDebugLoc());
8164 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8165 } else if (DepArray) {
8166 // HasNoWait - meaning the task may be deferred. Call
8167 // __kmpc_omp_task_with_deps if there are dependencies,
8168 // else call __kmpc_omp_task
8169 Function *TaskFn =
8170 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8171 Builder.CreateCall(
8172 TaskFn,
8173 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8174 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8175 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8176 } else {
8177 // Emit the @__kmpc_omp_task runtime call to spawn the task
8178 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8179 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8180 }
8181
8182 StaleCI->eraseFromParent();
8183 for (Instruction *I : llvm::reverse(ToBeDeleted))
8184 I->eraseFromParent();
8185 };
8186 addOutlineInfo(std::move(OI));
8187
8188 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8189 << *(Builder.GetInsertBlock()) << "\n");
8190 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8191 << *(Builder.GetInsertBlock()->getParent()->getParent())
8192 << "\n");
8193 return Builder.saveIP();
8194}
8195
8196Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8197 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8198 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8199 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8200 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8201 if (Error Err =
8202 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8203 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8204 return Err;
8205 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8206 return Error::success();
8207}
8208
8209static void emitTargetCall(
8210 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8211 OpenMPIRBuilder::InsertPointTy AllocaIP,
8212 OpenMPIRBuilder::TargetDataInfo &Info,
8213 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8214 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8215 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8216 SmallVectorImpl<Value *> &Args,
8217 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8218 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8219 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies,
8220 bool HasNoWait) {
8221 // Generate a function call to the host fallback implementation of the target
8222 // region. This is called by the host when no offload entry was generated for
8223 // the target region and when the offloading call fails at runtime.
8224 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8225 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8226 Builder.restoreIP(IP);
8227 Builder.CreateCall(OutlinedFn, Args);
8228 return Builder.saveIP();
8229 };
8230
8231 bool HasDependencies = Dependencies.size() > 0;
8232 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8233
8234 OpenMPIRBuilder::TargetKernelArgs KArgs;
8235
8236 auto TaskBodyCB =
8237 [&](Value *DeviceID, Value *RTLoc,
8238 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8239 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8240 // produce any.
8241 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8242 // emitKernelLaunch makes the necessary runtime call to offload the
8243 // kernel. We then outline all that code into a separate function
8244 // ('kernel_launch_function' in the pseudo code above). This function is
8245 // then called by the target task proxy function (see
8246 // '@.omp_target_task_proxy_func' in the pseudo code above)
8247 // "@.omp_target_task_proxy_func' is generated by
8248 // emitTargetTaskProxyFunction.
8249 if (OutlinedFnID && DeviceID)
8250 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8251 EmitTargetCallFallbackCB, KArgs,
8252 DeviceID, RTLoc, TargetTaskAllocaIP);
8253
8254 // We only need to do the outlining if `DeviceID` is set to avoid calling
8255 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8256 // generating the `else` branch of an `if` clause.
8257 //
8258 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8259 // In this case, we execute the host implementation directly.
8260 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8261 }());
8262
8263 OMPBuilder.Builder.restoreIP(AfterIP);
8264 return Error::success();
8265 };
8266
8267 auto &&EmitTargetCallElse =
8268 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8269 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8270 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8271 // produce any.
8272 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8273 if (RequiresOuterTargetTask) {
8274 // Arguments that are intended to be directly forwarded to an
8275 // emitKernelLaunch call are passed as nullptr, since
8276 // OutlinedFnID=nullptr results in that call not being done.
8277 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8278 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8279 /*RTLoc=*/nullptr, AllocaIP,
8280 Dependencies, EmptyRTArgs, HasNoWait);
8281 }
8282 return EmitTargetCallFallbackCB(Builder.saveIP());
8283 }());
8284
8285 Builder.restoreIP(AfterIP);
8286 return Error::success();
8287 };
8288
8289 auto &&EmitTargetCallThen =
8290 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8291 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8292 Info.HasNoWait = HasNoWait;
8293 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8294 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8295 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8296 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8297 /*IsNonContiguous=*/true,
8298 /*ForEndCall=*/false))
8299 return Err;
8300
8301 SmallVector<Value *, 3> NumTeamsC;
8302 for (auto [DefaultVal, RuntimeVal] :
8303 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8304 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8305 : Builder.getInt32(DefaultVal));
8306
8307 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8308 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8309 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8310 if (Clause)
8311 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8312 /*isSigned=*/false);
8313 return Clause;
8314 };
8315 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8316 if (Clause)
8317 Result =
8318 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8319 Result, Clause)
8320 : Clause;
8321 };
8322
8323 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8324 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8325 SmallVector<Value *, 3> NumThreadsC;
8326 Value *MaxThreadsClause =
8327 RuntimeAttrs.TeamsThreadLimit.size() == 1
8328 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8329 : nullptr;
8330
8331 for (auto [TeamsVal, TargetVal] : zip_equal(
8332 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8333 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8334 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8335
8336 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8337 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8338
8339 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8340 }
8341
8342 unsigned NumTargetItems = Info.NumberOfPtrs;
8343 // TODO: Use correct device ID
8344 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8345 uint32_t SrcLocStrSize;
8346 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8347 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8348 llvm::omp::IdentFlag(0), 0);
8349
8350 Value *TripCount = RuntimeAttrs.LoopTripCount
8351 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8352 Builder.getInt64Ty(),
8353 /*isSigned=*/false)
8354 : Builder.getInt64(0);
8355
8356 // TODO: Use correct DynCGGroupMem
8357 Value *DynCGGroupMem = Builder.getInt32(0);
8358
8359 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8360 NumTeamsC, NumThreadsC,
8361 DynCGGroupMem, HasNoWait);
8362
8363 // Assume no error was returned because TaskBodyCB and
8364 // EmitTargetCallFallbackCB don't produce any.
8365 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8366 // The presence of certain clauses on the target directive require the
8367 // explicit generation of the target task.
8368 if (RequiresOuterTargetTask)
8369 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8370 Dependencies, KArgs.RTArgs,
8371 Info.HasNoWait);
8372
8373 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8374 EmitTargetCallFallbackCB, KArgs,
8375 DeviceID, RTLoc, AllocaIP);
8376 }());
8377
8378 Builder.restoreIP(AfterIP);
8379 return Error::success();
8380 };
8381
8382 // If we don't have an ID for the target region, it means an offload entry
8383 // wasn't created. In this case we just run the host fallback directly and
8384 // ignore any potential 'if' clauses.
8385 if (!OutlinedFnID) {
8386 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8387 return;
8388 }
8389
8390 // If there's no 'if' clause, only generate the kernel launch code path.
8391 if (!IfCond) {
8392 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8393 return;
8394 }
8395
8396 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8397 EmitTargetCallElse, AllocaIP));
8398}
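// Worked example for the clause folding above (illustrative): if both
// thread_limit(16) and num_threads(10) are present, the emitted code computes
// select(10 < 16, 10, 16) == 10 as the kernel's thread bound; if neither
// clause is given, 0 is passed and the runtime picks the number of threads.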
8399
8400OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8401 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8402 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8403 TargetRegionEntryInfo &EntryInfo,
8404 const TargetKernelDefaultAttrs &DefaultAttrs,
8405 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8406 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8407 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8408 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8409 CustomMapperCallbackTy CustomMapperCB,
8410 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8411
8412 if (!updateToLocation(Loc))
8413 return InsertPointTy();
8414
8415 Builder.restoreIP(CodeGenIP);
8416
8417 Function *OutlinedFn;
8418 Constant *OutlinedFnID = nullptr;
8419 // The target region is outlined into its own function. The LLVM IR for
8420 // the target region itself is generated using the callbacks CBFunc
8421 // and ArgAccessorFuncCB
8422 if (Error Err = emitTargetOutlinedFunction(
8423 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8424 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8425 return Err;
8426
8427 // If we are not on the target device, then we need to generate code
8428 // to make a remote call (offload) to the previously outlined function
8429 // that represents the target region. Do that now.
8430 if (!Config.isTargetDevice())
8431 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8432 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8433 CustomMapperCB, Dependencies, HasNowait);
8434 return Builder.saveIP();
8435}
8436
8437std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8438 StringRef FirstSeparator,
8439 StringRef Separator) {
8440 SmallString<128> Buffer;
8441 llvm::raw_svector_ostream OS(Buffer);
8442 StringRef Sep = FirstSeparator;
8443 for (StringRef Part : Parts) {
8444 OS << Sep << Part;
8445 Sep = Separator;
8446 }
8447 return OS.str().str();
8448}
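// For example, getNameWithSeparators({"pi", "var"}, ".", "$") yields
// ".pi$var": FirstSeparator precedes the first part and Separator joins the
// remaining parts.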
8449
8450std::string
8451OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8452 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8453 Config.separator());
8454}
8455
8456 GlobalVariable *
8457 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
8458 unsigned AddressSpace) {
8459 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8460 if (Elem.second) {
8461 assert(Elem.second->getValueType() == Ty &&
8462 "OMP internal variable has different type than requested");
8463 } else {
8464 // TODO: investigate the appropriate linkage type used for the global
8465 // variable for possibly changing that to internal or private, or maybe
8466 // create different versions of the function for different OMP internal
8467 // variables.
8468 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8469 ? GlobalValue::InternalLinkage
8470 : GlobalValue::CommonLinkage;
8471 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8472 Constant::getNullValue(Ty), Elem.first(),
8473 /*InsertBefore=*/nullptr,
8474 GlobalValue::NotThreadLocal, AddressSpace);
8475 const DataLayout &DL = M.getDataLayout();
8476 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8477 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
8478 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8479 Elem.second = GV;
8480 }
8481
8482 return Elem.second;
8483}
8484
8485Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8486 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8487 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8488 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8489}
8490
8491Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8492 LLVMContext &Ctx = Builder.getContext();
8493 Value *Null =
8494 Constant::getNullValue(PointerType::getUnqual(Ctx));
8495 Value *SizeGep =
8496 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8497 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8498 return SizePtrToInt;
8499}
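// A minimal sketch of the IR the above emits for some pointer %base
// (illustrative): the classic "sizeof via GEP on null" idiom.
//   %gep = getelementptr ptr, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64
// Note that the GEP strides by BasePtr's own pointer type, so this computes
// the pointer size in bytes.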
8500
8501GlobalVariable *
8502OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8503 std::string VarName) {
8504 llvm::Constant *MaptypesArrayInit =
8505 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8506 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8507 M, MaptypesArrayInit->getType(),
8508 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8509 VarName);
8510 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8511 return MaptypesArrayGlobal;
8512}
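// For example (illustrative values), Mappings = {0x23, 0x23} -- two 'tofrom'
// arguments that are also kernel parameters -- lowers to:
//   @.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35]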
8513
8514void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8515 InsertPointTy AllocaIP,
8516 unsigned NumOperands,
8517 struct MapperAllocas &MapperAllocas) {
8518 if (!updateToLocation(Loc))
8519 return;
8520
8521 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8522 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8523 Builder.restoreIP(AllocaIP);
8524 AllocaInst *ArgsBase = Builder.CreateAlloca(
8525 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8526 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8527 ".offload_ptrs");
8528 AllocaInst *ArgSizes = Builder.CreateAlloca(
8529 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8530 updateToLocation(Loc);
8531 MapperAllocas.ArgsBase = ArgsBase;
8532 MapperAllocas.Args = Args;
8533 MapperAllocas.ArgSizes = ArgSizes;
8534}
8535
8536void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8537 Function *MapperFunc, Value *SrcLocInfo,
8538 Value *MaptypesArg, Value *MapnamesArg,
8539 struct MapperAllocas &MapperAllocas,
8540 int64_t DeviceID, unsigned NumOperands) {
8541 if (!updateToLocation(Loc))
8542 return;
8543
8544 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8545 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8546 Value *ArgsBaseGEP =
8547 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8548 {Builder.getInt32(0), Builder.getInt32(0)});
8549 Value *ArgsGEP =
8550 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8551 {Builder.getInt32(0), Builder.getInt32(0)});
8552 Value *ArgSizesGEP =
8553 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8554 {Builder.getInt32(0), Builder.getInt32(0)});
8555 Value *NullPtr =
8556 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8557 Builder.CreateCall(MapperFunc,
8558 {SrcLocInfo, Builder.getInt64(DeviceID),
8559 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8560 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8561}
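// A sketch of the resulting runtime call (illustrative; MapperFunc is, e.g.,
// __tgt_target_data_begin_mapper and N is NumOperands):
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 %device_id,
//       i32 N, ptr %.offload_baseptrs, ptr %.offload_ptrs, ptr %.offload_sizes,
//       ptr %maptypes, ptr %mapnames, ptr null)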
8562
8563void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8564 TargetDataRTArgs &RTArgs,
8565 TargetDataInfo &Info,
8566 bool ForEndCall) {
8567 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8568 "expected region end call to runtime only when end call is separate");
8569 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8570 auto VoidPtrTy = UnqualPtrTy;
8571 auto VoidPtrPtrTy = UnqualPtrTy;
8572 auto Int64Ty = Type::getInt64Ty(M.getContext());
8573 auto Int64PtrTy = UnqualPtrTy;
8574
8575 if (!Info.NumberOfPtrs) {
8576 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8577 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8578 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8579 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8580 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8581 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8582 return;
8583 }
8584
8585 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8586 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8587 Info.RTArgs.BasePointersArray,
8588 /*Idx0=*/0, /*Idx1=*/0);
8589 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8590 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8591 /*Idx0=*/0,
8592 /*Idx1=*/0);
8593 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8594 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8595 /*Idx0=*/0, /*Idx1=*/0);
8596 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8597 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8598 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8599 : Info.RTArgs.MapTypesArray,
8600 /*Idx0=*/0,
8601 /*Idx1=*/0);
8602
8603 // Only emit the mapper information arrays if debug information is
8604 // requested.
8605 if (!Info.EmitDebug)
8606 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8607 else
8608 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8609 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8610 /*Idx0=*/0,
8611 /*Idx1=*/0);
8612 // If there is no user-defined mapper, set the mapper array to nullptr to
8613 // avoid an unnecessary data privatization
8614 if (!Info.HasMapper)
8615 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8616 else
8617 RTArgs.MappersArray =
8618 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8619}
8620
8621void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8622 InsertPointTy CodeGenIP,
8623 MapInfosTy &CombinedInfo,
8624 TargetDataInfo &Info) {
8625 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8626 CombinedInfo.NonContigInfo;
8627
8628 // Build an array of struct descriptor_dim and then assign it to
8629 // offload_args.
8630 //
8631 // struct descriptor_dim {
8632 // uint64_t offset;
8633 // uint64_t count;
8634 // uint64_t stride;
8635 // };
8636 Type *Int64Ty = Builder.getInt64Ty();
8637 StructType *DimTy = StructType::create(
8638 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8639 "struct.descriptor_dim");
8640
8641 enum { OffsetFD = 0, CountFD, StrideFD };
8642 // We need two index variables here since the size of "Dims" is the same as
8643 // the size of Components; however, the size of offset, count, and stride is
8644 // equal to the number of base declarations that are non-contiguous.
8645 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8646 // Skip emitting IR if the dimension size is 1, since it cannot be
8647 // non-contiguous.
8648 if (NonContigInfo.Dims[I] == 1)
8649 continue;
8650 Builder.restoreIP(AllocaIP);
8651 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8652 AllocaInst *DimsAddr =
8653 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8654 Builder.restoreIP(CodeGenIP);
8655 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8656 unsigned RevIdx = EE - II - 1;
8657 Value *DimsLVal = Builder.CreateInBoundsGEP(
8658 DimsAddr->getAllocatedType(), DimsAddr,
8659 {Builder.getInt64(0), Builder.getInt64(II)});
8660 // Offset
8661 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8662 Builder.CreateAlignedStore(
8663 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8664 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8665 // Count
8666 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8667 Builder.CreateAlignedStore(
8668 NonContigInfo.Counts[L][RevIdx], CountLVal,
8669 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8670 // Stride
8671 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8672 Builder.CreateAlignedStore(
8673 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8674 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8675 }
8676 // args[I] = &dims
8677 Builder.restoreIP(CodeGenIP);
8678 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8679 DimsAddr, Builder.getPtrTy());
8680 Value *P = Builder.CreateConstInBoundsGEP2_32(
8681 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8682 Info.RTArgs.PointersArray, 0, I);
8683 Builder.CreateAlignedStore(
8684 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8685 ++L;
8686 }
8687}
8688
8689void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8690 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8691 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8692 BasicBlock *ExitBB, bool IsInit) {
8693 StringRef Prefix = IsInit ? ".init" : ".del";
8694
8695 // Evaluate if this is an array section.
8696 BasicBlock *BodyBB = BasicBlock::Create(
8697 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8698 Value *IsArray =
8699 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8700 Value *DeleteBit = Builder.CreateAnd(
8701 MapType,
8702 Builder.getInt64(
8703 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8704 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8705 Value *DeleteCond;
8706 Value *Cond;
8707 if (IsInit) {
8708 // base != begin?
8709 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8710 // IsPtrAndObj?
8711 Value *PtrAndObjBit = Builder.CreateAnd(
8712 MapType,
8713 Builder.getInt64(
8714 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8715 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8716 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8717 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8718 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8719 DeleteCond = Builder.CreateIsNull(
8720 DeleteBit,
8721 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8722 } else {
8723 Cond = IsArray;
8724 DeleteCond = Builder.CreateIsNotNull(
8725 DeleteBit,
8726 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8727 }
8728 Cond = Builder.CreateAnd(Cond, DeleteCond);
8729 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8730
8731 emitBlock(BodyBB, MapperFn);
8732 // Get the array size by multiplying element size and element number (i.e., \p
8733 // Size).
8734 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8735 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8736 // memory allocation/deletion purpose only.
8737 Value *MapTypeArg = Builder.CreateAnd(
8738 MapType,
8739 Builder.getInt64(
8740 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8741 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8742 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8743 MapTypeArg = Builder.CreateOr(
8744 MapTypeArg,
8745 Builder.getInt64(
8746 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8747 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8748
8749 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8750 // data structure.
8751 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8752 ArraySize, MapTypeArg, MapName};
8753 Builder.CreateCall(
8754 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8755 OffloadingArgs);
8756}
8757
8758Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8759 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8760 llvm::Value *BeginArg)>
8761 GenMapInfoCB,
8762 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8763 SmallVector<Type *> Params;
8764 Params.emplace_back(Builder.getPtrTy());
8765 Params.emplace_back(Builder.getPtrTy());
8766 Params.emplace_back(Builder.getPtrTy());
8767 Params.emplace_back(Builder.getInt64Ty());
8768 Params.emplace_back(Builder.getInt64Ty());
8769 Params.emplace_back(Builder.getPtrTy());
8770
8771 auto *FnTy =
8772 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8773
8774 SmallString<64> TyStr;
8775 raw_svector_ostream Out(TyStr);
8776 Function *MapperFn =
8777 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, &M);
8778 MapperFn->addFnAttr(Attribute::NoInline);
8779 MapperFn->addFnAttr(Attribute::NoUnwind);
8780 MapperFn->addParamAttr(0, Attribute::NoUndef);
8781 MapperFn->addParamAttr(1, Attribute::NoUndef);
8782 MapperFn->addParamAttr(2, Attribute::NoUndef);
8783 MapperFn->addParamAttr(3, Attribute::NoUndef);
8784 MapperFn->addParamAttr(4, Attribute::NoUndef);
8785 MapperFn->addParamAttr(5, Attribute::NoUndef);
8786
8787 // Start the mapper function code generation.
8788 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8789 auto SavedIP = Builder.saveIP();
8790 Builder.SetInsertPoint(EntryBB);
8791
8792 Value *MapperHandle = MapperFn->getArg(0);
8793 Value *BaseIn = MapperFn->getArg(1);
8794 Value *BeginIn = MapperFn->getArg(2);
8795 Value *Size = MapperFn->getArg(3);
8796 Value *MapType = MapperFn->getArg(4);
8797 Value *MapName = MapperFn->getArg(5);
8798
8799 // Compute the starting and end addresses of array elements.
8800 // Prepare common arguments for array initialization and deletion.
8801 // Convert the size in bytes into the number of array elements.
8802 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8803 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8804 Value *PtrBegin = BeginIn;
8805 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8806
8807 // Emit array initialization if this is an array section and \p MapType
8808 // indicates that memory allocation is required.
8809 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8810 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8811 MapType, MapName, ElementSize, HeadBB,
8812 /*IsInit=*/true);
8813
8814 // Emit a for loop to iterate through SizeArg elements and map all of them.
8815
8816 // Emit the loop header block.
8817 emitBlock(HeadBB, MapperFn);
8818 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8819 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8820 // Evaluate whether the initial condition is satisfied.
8821 Value *IsEmpty =
8822 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8823 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8824
8825 // Emit the loop body block.
8826 emitBlock(BodyBB, MapperFn);
8827 BasicBlock *LastBB = BodyBB;
8828 PHINode *PtrPHI =
8829 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8830 PtrPHI->addIncoming(PtrBegin, HeadBB);
8831
8832 // Get map clause information. Fill up the arrays with all mapped variables.
8833 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8834 if (!Info)
8835 return Info.takeError();
8836
8837 // Call the runtime API __tgt_mapper_num_components to get the number of
8838 // pre-existing components.
8839 Value *OffloadingArgs[] = {MapperHandle};
8840 Value *PreviousSize = Builder.CreateCall(
8841 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8842 OffloadingArgs);
8843 Value *ShiftedPreviousSize =
8844 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8845
8846 // Fill up the runtime mapper handle for all components.
8847 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8848 Value *CurBaseArg = Info->BasePointers[I];
8849 Value *CurBeginArg = Info->Pointers[I];
8850 Value *CurSizeArg = Info->Sizes[I];
8851 Value *CurNameArg = Info->Names.size()
8852 ? Info->Names[I]
8853 : Constant::getNullValue(Builder.getPtrTy());
8854
8855 // Extract the MEMBER_OF field from the map type.
8856 Value *OriMapType = Builder.getInt64(
8857 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8858 Info->Types[I]));
8859 Value *MemberMapType =
8860 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8861
8862 // Combine the map type inherited from user-defined mapper with that
8863 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8864 // bits of the \a MapType, which is the input argument of the mapper
8865 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8866 // bits of MemberMapType.
8867 // [OpenMP 5.0], 1.2.6. map-type decay.
8868 // | alloc | to | from | tofrom | release | delete
8869 // ----------------------------------------------------------
8870 // alloc | alloc | alloc | alloc | alloc | release | delete
8871 // to | alloc | to | alloc | to | release | delete
8872 // from | alloc | alloc | from | from | release | delete
8873 // tofrom | alloc | to | from | tofrom | release | delete
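// For example, if the mapper was invoked with MapType 'to' and the member
// was mapped 'tofrom' in the program, the member decays to 'to': the
// OMP_MAP_FROM bit is cleared by the branch emitted below.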
8874 Value *LeftToFrom = Builder.CreateAnd(
8875 MapType,
8876 Builder.getInt64(
8877 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8878 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8879 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8880 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8881 BasicBlock *AllocElseBB =
8882 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8883 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8884 BasicBlock *ToElseBB =
8885 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8886 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8887 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8888 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8889 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8890 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8891 emitBlock(AllocBB, MapperFn);
8892 Value *AllocMapType = Builder.CreateAnd(
8893 MemberMapType,
8894 Builder.getInt64(
8895 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8896 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8897 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8898 Builder.CreateBr(EndBB);
8899 emitBlock(AllocElseBB, MapperFn);
8900 Value *IsTo = Builder.CreateICmpEQ(
8901 LeftToFrom,
8902 Builder.getInt64(
8903 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8904 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8905 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8906 // In case of to, clear OMP_MAP_FROM.
8907 emitBlock(ToBB, MapperFn);
8908 Value *ToMapType = Builder.CreateAnd(
8909 MemberMapType,
8910 Builder.getInt64(
8911 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8912 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8913 Builder.CreateBr(EndBB);
8914 emitBlock(ToElseBB, MapperFn);
8915 Value *IsFrom = Builder.CreateICmpEQ(
8916 LeftToFrom,
8917 Builder.getInt64(
8918 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8919 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8920 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8921 // In case of from, clear OMP_MAP_TO.
8922 emitBlock(FromBB, MapperFn);
8923 Value *FromMapType = Builder.CreateAnd(
8924 MemberMapType,
8925 Builder.getInt64(
8926 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8927 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8928 // In case of tofrom, do nothing.
8929 emitBlock(EndBB, MapperFn);
8930 LastBB = EndBB;
8931 PHINode *CurMapType =
8932 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8933 CurMapType->addIncoming(AllocMapType, AllocBB);
8934 CurMapType->addIncoming(ToMapType, ToBB);
8935 CurMapType->addIncoming(FromMapType, FromBB);
8936 CurMapType->addIncoming(MemberMapType, ToElseBB);
8937
8938 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8939 CurSizeArg, CurMapType, CurNameArg};
8940
8941 auto ChildMapperFn = CustomMapperCB(I);
8942 if (!ChildMapperFn)
8943 return ChildMapperFn.takeError();
8944 if (*ChildMapperFn) {
8945 // Call the corresponding mapper function.
8946 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8947 } else {
8948 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8949 // data structure.
8950 Builder.CreateCall(
8951 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8952 OffloadingArgs);
8953 }
8954 }
8955
8956 // Update the pointer to point to the next element that needs to be mapped,
8957 // and check whether we have mapped all elements.
8958 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8959 "omp.arraymap.next");
8960 PtrPHI->addIncoming(PtrNext, LastBB);
8961 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8962 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8963 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8964
8965 emitBlock(ExitBB, MapperFn);
8966 // Emit array deletion if this is an array section and \p MapType indicates
8967 // that deletion is required.
8968 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8969 MapType, MapName, ElementSize, DoneBB,
8970 /*IsInit=*/false);
8971
8972 // Emit the function exit block.
8973 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8974
8975 Builder.CreateRetVoid();
8976 Builder.restoreIP(SavedIP);
8977 return MapperFn;
8978}
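// The generated mapper has the following shape (illustrative IR skeleton):
//   define internal void @<FuncName>(ptr %handle, ptr %base, ptr %begin,
//                                    i64 %size, i64 %type, ptr %name) {
//     ; optional ".init" array pass, a per-element loop that pushes each
//     ; component with its decayed map type, an optional ".del" array pass,
//     ; then ret void
//   }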
8979
8980Error OpenMPIRBuilder::emitOffloadingArrays(
8981 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8982 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8983 bool IsNonContiguous,
8984 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8985
8986 // Reset the array information.
8987 Info.clearArrayInfo();
8988 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8989
8990 if (Info.NumberOfPtrs == 0)
8991 return Error::success();
8992
8993 Builder.restoreIP(AllocaIP);
8994 // Detect whether any captured size requires runtime evaluation; if none
8995 // does, a constant array can eventually be used for the map sizes.
8996 ArrayType *PointerArrayType =
8997 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8998
8999 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9000 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9001
9002 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9003 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9004 AllocaInst *MappersArray = Builder.CreateAlloca(
9005 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9006 Info.RTArgs.MappersArray = MappersArray;
9007
9008 // If we don't have any VLA types or other types that require runtime
9009 // evaluation, we can use a constant array for the map sizes, otherwise we
9010 // need to fill up the arrays as we do for the pointers.
9011 Type *Int64Ty = Builder.getInt64Ty();
9012 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9013 ConstantInt::get(Int64Ty, 0));
9014 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9015 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9016 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9017 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9018 if (IsNonContiguous &&
9019 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9020 CombinedInfo.Types[I] &
9021 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9022 ConstSizes[I] =
9023 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9024 else
9025 ConstSizes[I] = CI;
9026 continue;
9027 }
9028 }
9029 RuntimeSizes.set(I);
9030 }
9031
9032 if (RuntimeSizes.all()) {
9033 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9034 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9035 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9036 restoreIPandDebugLoc(Builder, CodeGenIP);
9037 } else {
9038 auto *SizesArrayInit = ConstantArray::get(
9039 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9040 std::string Name = createPlatformSpecificName({"offload_sizes"});
9041 auto *SizesArrayGbl =
9042 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9043 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9044 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9045
9046 if (!RuntimeSizes.any()) {
9047 Info.RTArgs.SizesArray = SizesArrayGbl;
9048 } else {
9049 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9050 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9051 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9052 AllocaInst *Buffer = Builder.CreateAlloca(
9053 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9054 Buffer->setAlignment(OffloadSizeAlign);
9055 restoreIPandDebugLoc(Builder, CodeGenIP);
9056 Builder.CreateMemCpy(
9057 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9058 SizesArrayGbl, OffloadSizeAlign,
9059 Builder.getIntN(
9060 IndexSize,
9061 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9062
9063 Info.RTArgs.SizesArray = Buffer;
9064 }
9065 restoreIPandDebugLoc(Builder, CodeGenIP);
9066 }
9067
9068 // The map types are always constant so we don't need to generate code to
9069 // fill arrays. Instead, we create an array constant.
9070 SmallVector<uint64_t, 4> Mapping;
9071 for (auto mapFlag : CombinedInfo.Types)
9072 Mapping.push_back(
9073 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9074 mapFlag));
9075 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9076 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9077 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9078
9079 // The information types are only built if provided.
9080 if (!CombinedInfo.Names.empty()) {
9081 auto *MapNamesArrayGbl = createOffloadMapnames(
9082 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9083 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9084 Info.EmitDebug = true;
9085 } else {
9086 Info.RTArgs.MapNamesArray =
9087 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9088 Info.EmitDebug = false;
9089 }
9090
9091 // If there's a present map type modifier, it must not be applied to the end
9092 // of a region, so generate a separate map type array in that case.
9093 if (Info.separateBeginEndCalls()) {
9094 bool EndMapTypesDiffer = false;
9095 for (uint64_t &Type : Mapping) {
9096 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9097 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9098 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9099 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9100 EndMapTypesDiffer = true;
9101 }
9102 }
9103 if (EndMapTypesDiffer) {
9104 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9105 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9106 }
9107 }
9108
9109 PointerType *PtrTy = Builder.getPtrTy();
9110 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9111 Value *BPVal = CombinedInfo.BasePointers[I];
9112 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9113 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9114 0, I);
9115 Builder.CreateAlignedStore(BPVal, BP,
9116 M.getDataLayout().getPrefTypeAlign(PtrTy));
9117
9118 if (Info.requiresDevicePointerInfo()) {
9119 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9120 CodeGenIP = Builder.saveIP();
9121 Builder.restoreIP(AllocaIP);
9122 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9123 Builder.restoreIP(CodeGenIP);
9124 if (DeviceAddrCB)
9125 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9126 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9127 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9128 if (DeviceAddrCB)
9129 DeviceAddrCB(I, BP);
9130 }
9131 }
9132
9133 Value *PVal = CombinedInfo.Pointers[I];
9134 Value *P = Builder.CreateConstInBoundsGEP2_32(
9135 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9136 I);
9137 // TODO: Check alignment correct.
9138 Builder.CreateAlignedStore(PVal, P,
9139 M.getDataLayout().getPrefTypeAlign(PtrTy));
9140
9141 if (RuntimeSizes.test(I)) {
9142 Value *S = Builder.CreateConstInBoundsGEP2_32(
9143 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9144 /*Idx0=*/0,
9145 /*Idx1=*/I);
9146 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9147 Int64Ty,
9148 /*isSigned=*/true),
9149 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9150 }
9151 // Fill up the mapper array.
9152 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9153 Value *MFunc = ConstantPointerNull::get(PtrTy);
9154
9155 auto CustomMFunc = CustomMapperCB(I);
9156 if (!CustomMFunc)
9157 return CustomMFunc.takeError();
9158 if (*CustomMFunc)
9159 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9160
9161 Value *MAddr = Builder.CreateInBoundsGEP(
9162 MappersArray->getAllocatedType(), MappersArray,
9163 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9164 Builder.CreateAlignedStore(
9165 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9166 }
9167
9168 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9169 Info.NumberOfPtrs == 0)
9170 return Error::success();
9171 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9172 return Error::success();
9173}
9174
9175void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9176 BasicBlock *CurBB = Builder.GetInsertBlock();
9177
9178 if (!CurBB || CurBB->getTerminator()) {
9179 // If there is no insert point or the previous block is already
9180 // terminated, don't touch it.
9181 } else {
9182 // Otherwise, create a fall-through branch.
9183 Builder.CreateBr(Target);
9184 }
9185
9186 Builder.ClearInsertionPoint();
9187}
9188
9189void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9190 bool IsFinished) {
9191 BasicBlock *CurBB = Builder.GetInsertBlock();
9192
9193 // Fall out of the current block (if necessary).
9194 emitBranch(BB);
9195
9196 if (IsFinished && BB->use_empty()) {
9197 BB->eraseFromParent();
9198 return;
9199 }
9200
9201 // Place the block after the current block, if possible, or else at
9202 // the end of the function.
9203 if (CurBB && CurBB->getParent())
9204 CurFn->insert(std::next(CurBB->getIterator()), BB);
9205 else
9206 CurFn->insert(CurFn->end(), BB);
9207 Builder.SetInsertPoint(BB);
9208}
9209
9210Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9211 BodyGenCallbackTy ElseGen,
9212 InsertPointTy AllocaIP) {
9213 // If the condition constant folds and can be elided, try to avoid emitting
9214 // the condition and the dead arm of the if/else.
9215 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9216 auto CondConstant = CI->getSExtValue();
9217 if (CondConstant)
9218 return ThenGen(AllocaIP, Builder.saveIP());
9219
9220 return ElseGen(AllocaIP, Builder.saveIP());
9221 }
9222
9223 Function *CurFn = Builder.GetInsertBlock()->getParent();
9224
9225 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9226 // emit the conditional branch.
9227 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9228 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9229 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9230 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9231 // Emit the 'then' code.
9232 emitBlock(ThenBlock, CurFn);
9233 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9234 return Err;
9235 emitBranch(ContBlock);
9236 // Emit the 'else' code if present.
9237 // There is no need to emit line number for unconditional branch.
9238 emitBlock(ElseBlock, CurFn);
9239 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9240 return Err;
9241 // There is no need to emit line number for unconditional branch.
9242 emitBranch(ContBlock);
9243 // Emit the continuation block for code after the if.
9244 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9245 return Error::success();
9246}
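// For a non-constant condition this emits the usual diamond (illustrative):
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// with both arms falling through to %omp_if.end, which becomes the new
// insertion point.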
9247
9248bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9249 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9250 assert(!(AO == AtomicOrdering::NotAtomic ||
9251 AO == AtomicOrdering::Unordered) &&
9252 "Unexpected Atomic Ordering.");
9253
9254 bool Flush = false;
9255 llvm::AtomicOrdering FlushAO = llvm::AtomicOrdering::Monotonic;
9256
9257 switch (AK) {
9258 case Read:
9259 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9260 AO == AtomicOrdering::SequentiallyConsistent) {
9261 FlushAO = AtomicOrdering::Acquire;
9262 Flush = true;
9263 }
9264 break;
9265 case Write:
9266 case Compare:
9267 case Update:
9268 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9269 AO == AtomicOrdering::SequentiallyConsistent) {
9270 FlushAO = AtomicOrdering::Release;
9271 Flush = true;
9272 }
9273 break;
9274 case Capture:
9275 switch (AO) {
9276 case AtomicOrdering::Acquire:
9277 FlushAO = AtomicOrdering::Acquire;
9278 Flush = true;
9279 break;
9280 case AtomicOrdering::Release:
9281 FlushAO = AtomicOrdering::Release;
9282 Flush = true;
9283 break;
9284 case AtomicOrdering::AcquireRelease:
9285 case AtomicOrdering::SequentiallyConsistent:
9286 FlushAO = AtomicOrdering::AcquireRelease;
9287 Flush = true;
9288 break;
9289 default:
9290 // do nothing - leave silently.
9291 break;
9292 }
9293 }
9294
9295 if (Flush) {
9296 // The flush runtime call does not yet take a memory ordering, so this
9297 // resolves which atomic ordering the flush would need, but issues the
9298 // flush call without it for now.
9299 // TODO: pass `FlushAO` after memory ordering support is added
9300 (void)FlushAO;
9301 emitFlush(Loc);
9302 }
9303
9304 // For AO == AtomicOrdering::Monotonic and all other combinations,
9305 // do nothing.
9306 return Flush;
9307}
9308
9309OpenMPIRBuilder::InsertPointTy
9310OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9311 AtomicOpValue &X, AtomicOpValue &V,
9312 AtomicOrdering AO, InsertPointTy AllocaIP) {
9313 if (!updateToLocation(Loc))
9314 return Loc.IP;
9315
9316 assert(X.Var->getType()->isPointerTy() &&
9317 "OMP Atomic expects a pointer to target memory");
9318 Type *XElemTy = X.ElemTy;
9319 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9320 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9321 "OMP atomic read expected a scalar type");
9322
9323 Value *XRead = nullptr;
9324
9325 if (XElemTy->isIntegerTy()) {
9326 LoadInst *XLD =
9327 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9328 XLD->setAtomic(AO);
9329 XRead = cast<Value>(XLD);
9330 } else if (XElemTy->isStructTy()) {
9331 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9332 // target does not support `atomicrmw` of the size of the struct
9333 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9334 OldVal->setAtomic(AO);
9335 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9336 unsigned LoadSize =
9337 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9338 OpenMPIRBuilder::AtomicInfo atomicInfo(
9339 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9340 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9341 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9342 XRead = AtomicLoadRes.first;
9343 OldVal->eraseFromParent();
9344 } else {
9345 // We need to perform atomic op as integer
9346 IntegerType *IntCastTy =
9347 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9348 LoadInst *XLoad =
9349 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9350 XLoad->setAtomic(AO);
9351 if (XElemTy->isFloatingPointTy()) {
9352 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9353 } else {
9354 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9355 }
9356 }
9357 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9358 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9359 return Builder.saveIP();
9360}
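// For an integer 'x' with acquire ordering this lowers to (illustrative):
//   %omp.atomic.read = load atomic i32, ptr %x acquire, align 4
//   store i32 %omp.atomic.read, ptr %v
// plus a __kmpc_flush call emitted by checkAndEmitFlushAfterAtomic for
// acquire and stronger orderings.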
9361
9362OpenMPIRBuilder::InsertPointTy
9363OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9364 AtomicOpValue &X, Value *Expr,
9365 AtomicOrdering AO, InsertPointTy AllocaIP) {
9366 if (!updateToLocation(Loc))
9367 return Loc.IP;
9368
9369 assert(X.Var->getType()->isPointerTy() &&
9370 "OMP Atomic expects a pointer to target memory");
9371 Type *XElemTy = X.ElemTy;
9372 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9373 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9374 "OMP atomic write expected a scalar type");
9375
9376 if (XElemTy->isIntegerTy()) {
9377 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9378 XSt->setAtomic(AO);
9379 } else if (XElemTy->isStructTy()) {
9380 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9381 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9382 unsigned LoadSize =
9383 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9384 OpenMPIRBuilder::AtomicInfo atomicInfo(
9385 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9386 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9387 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9388 OldVal->eraseFromParent();
9389 } else {
9390 // We need to bitcast and perform atomic op as integers
9391 IntegerType *IntCastTy =
9392 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9393 Value *ExprCast =
9394 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9395 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9396 XSt->setAtomic(AO);
9397 }
9398
9399 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9400 return Builder.saveIP();
9401}
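// E.g., for a 'float' x the store is performed through an integer of equal
// width (illustrative):
//   %atomic.src.int.cast = bitcast float %expr to i32
//   store atomic i32 %atomic.src.int.cast, ptr %x monotonic, align 4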
9402
9403OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9404 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9405 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9406 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9407 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9408 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9409 if (!updateToLocation(Loc))
9410 return Loc.IP;
9411
9412 LLVM_DEBUG({
9413 Type *XTy = X.Var->getType();
9414 assert(XTy->isPointerTy() &&
9415 "OMP Atomic expects a pointer to target memory");
9416 Type *XElemTy = X.ElemTy;
9417 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9418 XElemTy->isPointerTy()) &&
9419 "OMP atomic update expected a scalar type");
9420 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9421 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9422 "OpenMP atomic does not support LT or GT operations");
9423 });
9424
9425 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9426 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9427 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9428 if (!AtomicResult)
9429 return AtomicResult.takeError();
9430 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9431 return Builder.saveIP();
9432}
9433
9434// FIXME: Duplicating AtomicExpand
9435Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9436 AtomicRMWInst::BinOp RMWOp) {
9437 switch (RMWOp) {
9438 case AtomicRMWInst::Add:
9439 return Builder.CreateAdd(Src1, Src2);
9440 case AtomicRMWInst::Sub:
9441 return Builder.CreateSub(Src1, Src2);
9442 case AtomicRMWInst::And:
9443 return Builder.CreateAnd(Src1, Src2);
9444 case AtomicRMWInst::Nand:
9445 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9446 case AtomicRMWInst::Or:
9447 return Builder.CreateOr(Src1, Src2);
9448 case AtomicRMWInst::Xor:
9449 return Builder.CreateXor(Src1, Src2);
9450 case AtomicRMWInst::Xchg:
9451 case AtomicRMWInst::FAdd:
9452 case AtomicRMWInst::FSub:
9453 case AtomicRMWInst::BAD_BINOP:
9454 case AtomicRMWInst::Max:
9455 case AtomicRMWInst::Min:
9456 case AtomicRMWInst::UMax:
9457 case AtomicRMWInst::UMin:
9458 case AtomicRMWInst::FMax:
9459 case AtomicRMWInst::FMin:
9460 case AtomicRMWInst::FMaximum:
9461 case AtomicRMWInst::FMinimum:
9462 case AtomicRMWInst::UIncWrap:
9463 case AtomicRMWInst::UDecWrap:
9464 case AtomicRMWInst::USubCond:
9465 case AtomicRMWInst::USubSat:
9466 llvm_unreachable("Unsupported atomic update operation");
9467 }
9468 llvm_unreachable("Unsupported atomic update operation");
9469}
9470
9471Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9472 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9473 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9474 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9475 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9476 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9477 // or a complex datatype.
9478 bool emitRMWOp = false;
9479 switch (RMWOp) {
9480 case AtomicRMWInst::Add:
9481 case AtomicRMWInst::And:
9482 case AtomicRMWInst::Nand:
9483 case AtomicRMWInst::Or:
9484 case AtomicRMWInst::Xor:
9485 case AtomicRMWInst::Xchg:
9486 emitRMWOp = XElemTy;
9487 break;
9488 case AtomicRMWInst::Sub:
9489 emitRMWOp = (IsXBinopExpr && XElemTy);
9490 break;
9491 default:
9492 emitRMWOp = false;
9493 }
9494 emitRMWOp &= XElemTy->isIntegerTy();
9495
9496 std::pair<Value *, Value *> Res;
9497 if (emitRMWOp) {
9498 AtomicRMWInst *RMWInst =
9499 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9500 if (T.isAMDGPU()) {
9501 if (IsIgnoreDenormalMode)
9502 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9503 llvm::MDNode::get(Builder.getContext(), {}));
9504 if (!IsFineGrainedMemory)
9505 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9506 llvm::MDNode::get(Builder.getContext(), {}));
9507 if (!IsRemoteMemory)
9508 RMWInst->setMetadata("amdgpu.no.remote.memory",
9509 llvm::MDNode::get(Builder.getContext(), {}));
9510 }
9511 Res.first = RMWInst;
9512 // Not needed except in case of postfix captures. Generated anyway for
9513 // consistency with the else branch; any DCE pass will remove it.
9514 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9515 if (RMWOp == AtomicRMWInst::Xchg)
9516 Res.second = Res.first;
9517 else
9518 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9519 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9520 XElemTy->isStructTy()) {
9521 LoadInst *OldVal =
9522 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9523 OldVal->setAtomic(AO);
9524 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9525 unsigned LoadSize =
9526 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9527
9528 OpenMPIRBuilder::AtomicInfo atomicInfo(
9529 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9530 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9531 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9532 BasicBlock *CurBB = Builder.GetInsertBlock();
9533 Instruction *CurBBTI = CurBB->getTerminator();
9534 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9535 BasicBlock *ExitBB =
9536 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9537 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9538 X->getName() + ".atomic.cont");
9539 ContBB->getTerminator()->eraseFromParent();
9540 Builder.restoreIP(AllocaIP);
9541 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9542 NewAtomicAddr->setName(X->getName() + "x.new.val");
9543 Builder.SetInsertPoint(ContBB);
9544 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9545 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9546 Value *OldExprVal = PHI;
9547 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9548 if (!CBResult)
9549 return CBResult.takeError();
9550 Value *Upd = *CBResult;
9551 Builder.CreateStore(Upd, NewAtomicAddr);
9552 AtomicOrdering Failure =
9553 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9554 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9555 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9556 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9557 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9558 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9559 OldVal->eraseFromParent();
9560 Res.first = OldExprVal;
9561 Res.second = Upd;
9562
9563 if (UnreachableInst *ExitTI =
9564 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9565 CurBBTI->eraseFromParent();
9566 Builder.SetInsertPoint(ExitBB);
9567 } else {
9568 Builder.SetInsertPoint(ExitTI);
9569 }
9570 } else {
9571 IntegerType *IntCastTy =
9572 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9573 LoadInst *OldVal =
9574 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9575 OldVal->setAtomic(AO);
9576 // CurBB
9577 // | /---\
9578 // ContBB |
9579 // | \---/
9580 // ExitBB
9581 BasicBlock *CurBB = Builder.GetInsertBlock();
9582 Instruction *CurBBTI = CurBB->getTerminator();
9583 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9584 BasicBlock *ExitBB =
9585 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9586 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9587 X->getName() + ".atomic.cont");
9588 ContBB->getTerminator()->eraseFromParent();
9589 Builder.restoreIP(AllocaIP);
9590 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9591 NewAtomicAddr->setName(X->getName() + "x.new.val");
9592 Builder.SetInsertPoint(ContBB);
9593 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9594 PHI->addIncoming(OldVal, CurBB);
9595 bool IsIntTy = XElemTy->isIntegerTy();
9596 Value *OldExprVal = PHI;
9597 if (!IsIntTy) {
9598 if (XElemTy->isFloatingPointTy()) {
9599 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9600 X->getName() + ".atomic.fltCast");
9601 } else {
9602 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9603 X->getName() + ".atomic.ptrCast");
9604 }
9605 }
9606
9607 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9608 if (!CBResult)
9609 return CBResult.takeError();
9610 Value *Upd = *CBResult;
9611 Builder.CreateStore(Upd, NewAtomicAddr);
9612 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9613 AtomicOrdering Failure =
9614 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9615 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9616 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9617 Result->setVolatile(VolatileX);
9618 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9619 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9620 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9621 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9622
9623 Res.first = OldExprVal;
9624 Res.second = Upd;
9625
9626 // set Insertion point in exit block
9627 if (UnreachableInst *ExitTI =
9628 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9629 CurBBTI->eraseFromParent();
9630 Builder.SetInsertPoint(ExitBB);
9631 } else {
9632 Builder.SetInsertPoint(ExitTI);
9633 }
9634 }
9635
9636 return Res;
9637}
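// A sketch of the compare-exchange loop produced by the generic path above
// (illustrative, for an i32 'x' with monotonic ordering):
//   %x.atomic.load = load atomic i32, ptr %x monotonic, align 4
//   br label %x.atomic.cont
// x.atomic.cont:
//   %phi = phi i32 [ %x.atomic.load, %entry ], [ %prev, %x.atomic.cont ]
//   ; UpdateOp(%phi) is stored to and reloaded from the x.new.val alloca
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %upd monotonic monotonic, align 4
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %x.atomic.exit, label %x.atomic.cont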
9638
9639OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9640 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9641 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9642 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9643 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9644 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9645 if (!updateToLocation(Loc))
9646 return Loc.IP;
9647
9648 LLVM_DEBUG({
9649 Type *XTy = X.Var->getType();
9650 assert(XTy->isPointerTy() &&
9651 "OMP Atomic expects a pointer to target memory");
9652 Type *XElemTy = X.ElemTy;
9653 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9654 XElemTy->isPointerTy()) &&
9655 "OMP atomic capture expected a scalar type");
9656 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9657 "OpenMP atomic does not support LT or GT operations");
9658 });
9659
9660 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9661 // 'x' is simply atomically rewritten with 'expr'.
9662 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9663 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9664 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9665 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9666 if (!AtomicResult)
9667 return AtomicResult.takeError();
9668 Value *CapturedVal =
9669 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9670 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9671
9672 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9673 return Builder.saveIP();
9674}
9675
9676OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9677 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9678 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9679 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9680 bool IsFailOnly) {
9681
9682 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9683 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9684 IsPostfixUpdate, IsFailOnly, Failure);
9685}
9686
9687OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9688 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9689 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9690 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9691 bool IsFailOnly, AtomicOrdering Failure) {
9692
9693 if (!updateToLocation(Loc))
9694 return Loc.IP;
9695
9696 assert(X.Var->getType()->isPointerTy() &&
9697 "OMP atomic expects a pointer to target memory");
9698 // compare capture
9699 if (V.Var) {
9700 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9701 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9702 }
9703
9704 bool IsInteger = E->getType()->isIntegerTy();
9705
9706 if (Op == OMPAtomicCompareOp::EQ) {
9707 AtomicCmpXchgInst *Result = nullptr;
9708 if (!IsInteger) {
9709 IntegerType *IntCastTy =
9710 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9711 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9712 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9713 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9714 AO, Failure);
9715 } else {
9716 Result =
9717 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9718 }
9719
9720 if (V.Var) {
9721 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9722 if (!IsInteger)
9723 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9724 assert(OldValue->getType() == V.ElemTy &&
9725 "OldValue and V must be of same type");
9726 if (IsPostfixUpdate) {
9727 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9728 } else {
9729 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9730 if (IsFailOnly) {
9731 // CurBB----
9732 // | |
9733 // v |
9734 // ContBB |
9735 // | |
9736 // v |
9737 // ExitBB <-
9738 //
9739 // where ContBB only contains the store of old value to 'v'.
9740 BasicBlock *CurBB = Builder.GetInsertBlock();
9741 Instruction *CurBBTI = CurBB->getTerminator();
9742 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9743 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9744 CurBBTI, X.Var->getName() + ".atomic.exit");
9745 BasicBlock *ContBB = CurBB->splitBasicBlock(
9746 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9747 ContBB->getTerminator()->eraseFromParent();
9748 CurBB->getTerminator()->eraseFromParent();
9749
9750 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9751
9752 Builder.SetInsertPoint(ContBB);
9753 Builder.CreateStore(OldValue, V.Var);
9754 Builder.CreateBr(ExitBB);
9755
9756 if (UnreachableInst *ExitTI =
9757 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9758 CurBBTI->eraseFromParent();
9759 Builder.SetInsertPoint(ExitBB);
9760 } else {
9761 Builder.SetInsertPoint(ExitTI);
9762 }
9763 } else {
9764 Value *CapturedValue =
9765 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9766 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9767 }
9768 }
9769 }
9770 // The comparison result has to be stored.
9771 if (R.Var) {
9772 assert(R.Var->getType()->isPointerTy() &&
9773 "r.var must be of pointer type");
9774 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9775
9776 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9777 Value *ResultCast = R.IsSigned
9778 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9779 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9780 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9781 }
9782 } else {
9783 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9784 "Op should be either max or min at this point");
9785 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9786
9787 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9788 // Let's take max as example.
9789 // OpenMP form:
9790 // x = x > expr ? expr : x;
9791 // LLVM form:
9792 // *ptr = *ptr > val ? *ptr : val;
9793 // We need to transform to LLVM form.
9794 // x = x <= expr ? x : expr;
9795 AtomicRMWInst::BinOp NewOp;
9796 if (IsXBinopExpr) {
9797 if (IsInteger) {
9798 if (X.IsSigned)
9799 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9800 : AtomicRMWInst::Max;
9801 else
9802 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9803 : AtomicRMWInst::UMax;
9804 } else {
9805 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9806 : AtomicRMWInst::FMax;
9807 }
9808 } else {
9809 if (IsInteger) {
9810 if (X.IsSigned)
9811 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9812 : AtomicRMWInst::Min;
9813 else
9814 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9815 : AtomicRMWInst::UMin;
9816 } else {
9817 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9818 : AtomicRMWInst::FMin;
9819 }
9820 }
9821
9822 AtomicRMWInst *OldValue =
9823 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9824 if (V.Var) {
9825 Value *CapturedValue = nullptr;
9826 if (IsPostfixUpdate) {
9827 CapturedValue = OldValue;
9828 } else {
9829 CmpInst::Predicate Pred;
9830 switch (NewOp) {
9831 case AtomicRMWInst::Max:
9832 Pred = CmpInst::ICMP_SGT;
9833 break;
9834 case AtomicRMWInst::UMax:
9835 Pred = CmpInst::ICMP_UGT;
9836 break;
9837 case AtomicRMWInst::FMax:
9838 Pred = CmpInst::FCMP_OGT;
9839 break;
9840 case AtomicRMWInst::Min:
9841 Pred = CmpInst::ICMP_SLT;
9842 break;
9843 case AtomicRMWInst::UMin:
9844 Pred = CmpInst::ICMP_ULT;
9845 break;
9846 case AtomicRMWInst::FMin:
9847 Pred = CmpInst::FCMP_OLT;
9848 break;
9849 default:
9850 llvm_unreachable("unexpected comparison op");
9851 }
9852 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9853 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9854 }
9855 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9856 }
9857 }
9858
9859 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9860
9861 return Builder.saveIP();
9862}
9863
9864OpenMPIRBuilder::InsertPointOrErrorTy
9865OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9866 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9867 Value *NumTeamsUpper, Value *ThreadLimit,
9868 Value *IfExpr) {
9869 if (!updateToLocation(Loc))
9870 return InsertPointTy();
9871
9872 uint32_t SrcLocStrSize;
9873 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9874 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9875 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9876
9877 // Outer allocation basicblock is the entry block of the current function.
9878 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9879 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9880 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9881 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9882 }
9883
9884 // The current basic block is split into four basic blocks. After outlining,
9885 // they will be mapped as follows:
9886 // ```
9887 // def current_fn() {
9888 // current_basic_block:
9889 // br label %teams.exit
9890 // teams.exit:
9891 // ; instructions after teams
9892 // }
9893 //
9894 // def outlined_fn() {
9895 // teams.alloca:
9896 // br label %teams.body
9897 // teams.body:
9898 // ; instructions within teams body
9899 // }
9900 // ```
9901 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9902 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9903 BasicBlock *AllocaBB =
9904 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9905
9906 bool SubClausesPresent =
9907 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9908 // Push num_teams
9909 if (!Config.isTargetDevice() && SubClausesPresent) {
9910 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9911 "if lowerbound is non-null, then upperbound must also be non-null "
9912 "for bounds on num_teams");
9913
9914 if (NumTeamsUpper == nullptr)
9915 NumTeamsUpper = Builder.getInt32(0);
9916
9917 if (NumTeamsLower == nullptr)
9918 NumTeamsLower = NumTeamsUpper;
9919
9920 if (IfExpr) {
9921 assert(IfExpr->getType()->isIntegerTy() &&
9922 "argument to if clause must be an integer value");
9923
9924 // upper = ifexpr ? upper : 1
9925 if (IfExpr->getType() != Int1)
9926 IfExpr = Builder.CreateICmpNE(IfExpr,
9927 ConstantInt::get(IfExpr->getType(), 0));
9928 NumTeamsUpper = Builder.CreateSelect(
9929 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9930
9931 // lower = ifexpr ? lower : 1
9932 NumTeamsLower = Builder.CreateSelect(
9933 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9934 }
9935
9936 if (ThreadLimit == nullptr)
9937 ThreadLimit = Builder.getInt32(0);
9938
9939 Value *ThreadNum = getOrCreateThreadID(Ident);
9940 Builder.CreateCall(
9941 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9942 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9943 }
9944 // Generate the body of teams.
9945 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9946 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9947 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9948 return Err;
9949
9950 OutlineInfo OI;
9951 OI.EntryBB = AllocaBB;
9952 OI.ExitBB = ExitBB;
9953 OI.OuterAllocaBB = &OuterAllocaBB;
9954
9955 // Insert fake values for global tid and bound tid.
9956 SmallVector<Instruction *> ToBeDeleted;
9957 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9958 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9959 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9960 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9961 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9962
9963 auto HostPostOutlineCB = [this, Ident,
9964 ToBeDeleted](Function &OutlinedFn) mutable {
9965 // The stale call instruction will be replaced with a new call instruction
9966 // for the runtime call that takes the outlined function.
9967
9968 assert(OutlinedFn.hasOneUse() &&
9969 "there must be a single user for the outlined function");
9970 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9971 ToBeDeleted.push_back(StaleCI);
9972
9973 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9974 "Outlined function must have two or three arguments only");
9975
9976 bool HasShared = OutlinedFn.arg_size() == 3;
9977
9978 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9979 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9980 if (HasShared)
9981 OutlinedFn.getArg(2)->setName("data");
9982
9983 // Call to the runtime function for teams in the current function.
9984 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9985 "outlined function.");
9986 Builder.SetInsertPoint(StaleCI);
9987 SmallVector<Value *> Args = {
9988 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9989 if (HasShared)
9990 Args.push_back(StaleCI->getArgOperand(2));
9991 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9992 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9993 Args);
9994
9995 for (Instruction *I : llvm::reverse(ToBeDeleted))
9996 I->eraseFromParent();
9997 };
9998
9999 if (!Config.isTargetDevice())
10000 OI.PostOutlineCB = HostPostOutlineCB;
10001
10002 addOutlineInfo(std::move(OI));
10003
10004 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10005
10006 return Builder.saveIP();
10007}
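// Usage sketch (illustrative, not part of this file): driving createTeams from
// a frontend. The helper name and surrounding context are assumptions; only
// the createTeams signature above is taken from the source.
//
// static Error buildTeamsRegion(OpenMPIRBuilder &OMPBuilder,
//                               IRBuilder<> &Builder,
//                               const OpenMPIRBuilder::LocationDescription &Loc) {
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                        OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
//     Builder.restoreIP(CodeGenIP); // Emit the teams body here.
//     return Error::success();
//   };
//   // No clauses: pass nullptr so no __kmpc_push_num_teams_51 is emitted.
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTeams(
//       Loc, BodyGenCB, /*NumTeamsLower=*/nullptr, /*NumTeamsUpper=*/nullptr,
//       /*ThreadLimit=*/nullptr, /*IfExpr=*/nullptr);
//   if (!AfterIP)
//     return AfterIP.takeError();
//   Builder.restoreIP(*AfterIP); // Continue at teams.exit.
//   return Error::success();
// }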
10008
10009OpenMPIRBuilder::InsertPointOrErrorTy
10010OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10011 InsertPointTy OuterAllocaIP,
10012 BodyGenCallbackTy BodyGenCB) {
10013 if (!updateToLocation(Loc))
10014 return InsertPointTy();
10015
10016 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10017
10018 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10019 BasicBlock *BodyBB =
10020 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10021 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10022 }
10023 BasicBlock *ExitBB =
10024 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10025 BasicBlock *BodyBB =
10026 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10027 BasicBlock *AllocaBB =
10028 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10029
10030 // Generate the body of the distribute region.
10031 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10032 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10033 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10034 return Err;
10035
10036 OutlineInfo OI;
10037 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10038 OI.EntryBB = AllocaBB;
10039 OI.ExitBB = ExitBB;
10040
10041 addOutlineInfo(std::move(OI));
10042 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10043
10044 return Builder.saveIP();
10045}
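// Usage sketch (illustrative): wiring createDistribute into an enclosing teams
// or target region. OMPBuilder, Builder, Loc, OuterAllocaIP and BodyGenCB are
// assumed to exist in the caller, as in the createTeams sketch above.
//
// OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//     OMPBuilder.createDistribute(Loc, OuterAllocaIP, BodyGenCB);
// if (!AfterIP)
//   return AfterIP.takeError(); // Propagates failures from BodyGenCB.
// Builder.restoreIP(*AfterIP);  // Continue at distribute.exit.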
10046
10047GlobalVariable *
10048OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10049 std::string VarName) {
10050 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10051 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10052 Names.size()),
10053 Names);
10054 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10055 M, MapNamesArrayInit->getType(),
10056 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10057 VarName);
10058 return MapNamesArrayGlobal;
10059}
10060
10061// Create all simple and struct types exposed by the runtime and remember
10062// the llvm::PointerTypes of them for easy access later.
10063void OpenMPIRBuilder::initializeTypes(Module &M) {
10064 LLVMContext &Ctx = M.getContext();
10065 StructType *T;
10066 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10067#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10068#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10069 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10070 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10071#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10072 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10073 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10074#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10075 T = StructType::getTypeByName(Ctx, StructName); \
10076 if (!T) \
10077 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10078 VarName = T; \
10079 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10080#include "llvm/Frontend/OpenMP/OMPKinds.def"
10081}
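// Expansion sketch (illustrative): one plausible OMP_STRUCT_TYPE entry from
// OMPKinds.def, hand-expanded to show what the macro above generates; the
// element type list is elided and lives in OMPKinds.def.
//
// OMP_STRUCT_TYPE(Ident, "struct.ident_t", false, /*elements...*/) becomes:
//   T = StructType::getTypeByName(Ctx, "struct.ident_t");
//   if (!T)
//     T = StructType::create(Ctx, {/*elements...*/}, "struct.ident_t",
//                            /*Packed=*/false);
//   Ident = T;
//   IdentPtr = PointerType::get(Ctx, DefaultTargetAS);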
10082
10083void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10084 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10085 SmallVectorImpl<BasicBlock *> &BlockVector) {
10086 SmallVector<BasicBlock *, 32> Worklist;
10087 BlockSet.insert(EntryBB);
10088 BlockSet.insert(ExitBB);
10089
10090 Worklist.push_back(EntryBB);
10091 while (!Worklist.empty()) {
10092 BasicBlock *BB = Worklist.pop_back_val();
10093 BlockVector.push_back(BB);
10094 for (BasicBlock *SuccBB : successors(BB))
10095 if (BlockSet.insert(SuccBB).second)
10096 Worklist.push_back(SuccBB);
10097 }
10098}
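// Usage sketch (illustrative; the local names are assumptions): how an
// outlining client might collect the region's blocks.
//
// SmallPtrSet<BasicBlock *, 32> BlockSet;
// SmallVector<BasicBlock *> BlockVector;
// OI.collectBlocks(BlockSet, BlockVector);
// // BlockVector now holds every block reachable from EntryBB in discovery
// // order; ExitBB is pre-inserted into BlockSet, so the walk never leaves
// // the to-be-outlined region through it.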
10099
10100void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10101 uint64_t Size, int32_t Flags,
10102 GlobalValue::LinkageTypes,
10103 StringRef Name) {
10104 if (!Config.isGPU()) {
10105 llvm::offloading::emitOffloadingEntry(
10106 M, object::OffloadKind::OFK_OpenMP, ID,
10107 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10108 return;
10109 }
10110 // TODO: Add support for global variables on the device after declare target
10111 // support.
10112 Function *Fn = dyn_cast<Function>(Addr);
10113 if (!Fn)
10114 return;
10115
10116 // Add a function attribute for the kernel.
10117 Fn->addFnAttr("kernel");
10118 if (T.isAMDGCN())
10119 Fn->addFnAttr("uniform-work-group-size", "true");
10120 Fn->addFnAttr(Attribute::MustProgress);
10121}
10122
10123// We only generate metadata for functions that contain target regions.
10124void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10125 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10126
10127 // If there are no entries, we don't need to do anything.
10128 if (OffloadInfoManager.empty())
10129 return;
10130
10131 LLVMContext &C = M.getContext();
10132 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10133 TargetRegionEntryInfo>,
10134 16>
10135 OrderedEntries(OffloadInfoManager.size());
10136
10137 // Auxiliary methods to create metadata values and strings.
10138 auto &&GetMDInt = [this](unsigned V) {
10139 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10140 };
10141
10142 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10143
10144 // Create the offloading info metadata node.
10145 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10146 auto &&TargetRegionMetadataEmitter =
10147 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10148 const TargetRegionEntryInfo &EntryInfo,
10149 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10150 // Generate metadata for target regions. Each entry of this metadata
10151 // contains:
10152 // - Entry 0 -> Kind of this type of metadata (0).
10153 // - Entry 1 -> Device ID of the file where the entry was identified.
10154 // - Entry 2 -> File ID of the file where the entry was identified.
10155 // - Entry 3 -> Mangled name of the function where the entry was
10156 // identified.
10157 // - Entry 4 -> Line in the file where the entry was identified.
10158 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10159 // - Entry 6 -> Order the entry was created.
10160 // The first element of the metadata node is the kind.
10161 Metadata *Ops[] = {
10162 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10163 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10164 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10165 GetMDInt(E.getOrder())};
10166
10167 // Save this entry in the right position of the ordered entries array.
10168 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10169
10170 // Add metadata to the named metadata node.
10171 MD->addOperand(MDNode::get(C, Ops));
10172 };
10173
10174 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10175
10176 // Create a function that emits metadata for each device global variable entry.
10177 auto &&DeviceGlobalVarMetadataEmitter =
10178 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10179 StringRef MangledName,
10180 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10181 // Generate metadata for global variables. Each entry of this metadata
10182 // contains:
10183 // - Entry 0 -> Kind of this type of metadata (1).
10184 // - Entry 1 -> Mangled name of the variable.
10185 // - Entry 2 -> Declare target kind.
10186 // - Entry 3 -> Order the entry was created.
10187 // The first element of the metadata node is the kind.
10188 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10189 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10190
10191 // Save this entry in the right position of the ordered entries array.
10192 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10193 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10194
10195 // Add metadata to the named metadata node.
10196 MD->addOperand(MDNode::get(C, Ops));
10197 };
10198
10199 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10200 DeviceGlobalVarMetadataEmitter);
10201
10202 for (const auto &E : OrderedEntries) {
10203 assert(E.first && "All ordered entries must exist!");
10204 if (const auto *CE =
10205 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10206 E.first)) {
10207 if (!CE->getID() || !CE->getAddress()) {
10208 // Do not blame the entry if the parent function is not emitted.
10209 TargetRegionEntryInfo EntryInfo = E.second;
10210 StringRef FnName = EntryInfo.ParentName;
10211 if (!M.getNamedValue(FnName))
10212 continue;
10213 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10214 continue;
10215 }
10216 createOffloadEntry(CE->getID(), CE->getAddress(),
10217 /*Size=*/0, CE->getFlags(),
10218 GlobalValue::WeakAnyLinkage);
10219 } else if (const auto *CE = dyn_cast<
10220 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10221 E.first)) {
10222 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10223 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10224 CE->getFlags());
10225 switch (Flags) {
10226 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10227 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10228 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10229 continue;
10230 if (!CE->getAddress()) {
10231 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10232 continue;
10233 }
10234 // The variable has no definition - no need to add the entry.
10235 if (CE->getVarSize() == 0)
10236 continue;
10237 break;
10238 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10239 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10240 (!Config.isTargetDevice() && CE->getAddress())) &&
10241 "Declaret target link address is set.");
10242 if (Config.isTargetDevice())
10243 continue;
10244 if (!CE->getAddress()) {
10245 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10246 continue;
10247 }
10248 break;
10249 default:
10250 break;
10251 }
10252
10253 // Hidden or internal symbols on the device are not externally visible.
10254 // We should not attempt to register them by creating an offloading
10255 // entry. Indirect variables are handled separately on the device.
10256 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10257 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10258 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10259 continue;
10260
10261 // Indirect globals need to use a special name that doesn't match the name
10262 // of the associated host global.
10263 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10264 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10265 Flags, CE->getLinkage(), CE->getVarName());
10266 else
10267 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10268 Flags, CE->getLinkage());
10269
10270 } else {
10271 llvm_unreachable("Unsupported entry kind.");
10272 }
10273 }
10274
10275 // Emit requires directive globals to a special entry so the runtime can
10276 // register them when the device image is loaded.
10277 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10278 // entries should be redesigned to better suit this use-case.
10279 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10283 ".requires", /*Size=*/0,
10284 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10285 Config.getRequiresFlags());
10286}
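// Shape of the emitted metadata (illustrative; the operand values here are
// made up). Kind-0 entries follow the seven-field target-region layout
// documented above, kind-1 entries the four-field global-variable layout:
//
//   !omp_offload.info = !{!0, !1}
//   !0 = !{i32 0, i32 18, i32 52, !"foo", i32 7, i32 0, i32 0} ; target region
//   !1 = !{i32 1, !"gvar", i32 0, i32 1}                       ; global var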
10287
10288void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10289 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10290 unsigned FileID, unsigned Line, unsigned Count) {
10291 raw_svector_ostream OS(Name);
10292 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10293 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10294 if (Count)
10295 OS << "_" << Count;
10296}
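// Worked example (illustrative; input values are made up, and assuming
// KernelNamePrefix is "__omp_offloading_"): ParentName="foo", DeviceID=0x12,
// FileID=0x34, Line=42, Count=0 yields
//   __omp_offloading_12_34_foo_l42
// while Count=2 appends a suffix: __omp_offloading_12_34_foo_l42_2.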
10297
10298void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10299 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10300 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10301 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10302 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10303 EntryInfo.Line, NewCount);
10304}
10305
10306TargetRegionEntryInfo
10307OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10308 StringRef ParentName) {
10309 sys::fs::UniqueID ID(0xdeadf17e, 0);
10310 auto FileIDInfo = CallBack();
10311 uint64_t FileID = 0;
10312 std::error_code EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID);
10313 // If the inode ID could not be determined, create a hash value of the
10314 // current file name and use that as an ID.
10315 if (EC)
10316 FileID = hash_value(std::get<0>(FileIDInfo));
10317 else
10318 FileID = ID.getFile();
10319
10320 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10321 std::get<1>(FileIDInfo));
10322}
10323
10324unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10325 unsigned Offset = 0;
10326 for (uint64_t Remain =
10327 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10328 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10329 !(Remain & 1); Remain = Remain >> 1)
10330 Offset++;
10331 return Offset;
10332}
10333
10334omp::OpenMPOffloadMappingFlags
10335OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10336 // Shift left by getFlagMemberOffset() bits.
10337 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10338 << getFlagMemberOffset());
10339}
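// Worked example (illustrative, assuming OMP_MAP_MEMBER_OF occupies the high
// 16 bits, i.e. 0xFFFF000000000000): its lowest set bit is bit 48, so
// getFlagMemberOffset() returns 48 and getMemberOfFlag(0) evaluates to
// (0 + 1) << 48, encoding "member of map entry number 1". An all-ones
// MEMBER_OF field is the 0xFFFF placeholder that setCorrectMemberOfFlag()
// below replaces with the real value.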
10340
10341void OpenMPIRBuilder::setCorrectMemberOfFlag(
10342 omp::OpenMPOffloadMappingFlags &Flags,
10343 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10344 // If the entry is PTR_AND_OBJ but has not been marked with the special
10345 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10346 // marked as MEMBER_OF.
10347 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10348 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10349 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10350 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10351 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10352 return;
10353
10354 // Reset the placeholder value to prepare the flag for the assignment of the
10355 // proper MEMBER_OF value.
10356 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10357 Flags |= MemberOfFlag;
10358}
10359
10360Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10361 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10362 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10363 bool IsDeclaration, bool IsExternallyVisible,
10364 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10365 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10366 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10367 std::function<Constant *()> GlobalInitializer,
10368 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10369 // TODO: convert this to utilise the IRBuilder Config rather than
10370 // a passed down argument.
10371 if (OpenMPSIMD)
10372 return nullptr;
10373
10374 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10375 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10376 CaptureClause ==
10377 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10378 Config.hasRequiresUnifiedSharedMemory())) {
10379 SmallString<64> PtrName;
10380 {
10381 raw_svector_ostream OS(PtrName);
10382 OS << MangledName;
10383 if (!IsExternallyVisible)
10384 OS << format("_%x", EntryInfo.FileID);
10385 OS << "_decl_tgt_ref_ptr";
10386 }
10387
10388 Value *Ptr = M.getNamedValue(PtrName);
10389
10390 if (!Ptr) {
10391 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10392 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10393
10394 auto *GV = cast<GlobalVariable>(Ptr);
10395 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10396
10397 if (!Config.isTargetDevice()) {
10398 if (GlobalInitializer)
10399 GV->setInitializer(GlobalInitializer());
10400 else
10401 GV->setInitializer(GlobalValue);
10402 }
10403
10404 registerTargetGlobalVariable(
10405 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10406 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10407 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10408 }
10409
10410 return cast<Constant>(Ptr);
10411 }
10412
10413 return nullptr;
10414}
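// Illustrative output (the variable name is made up): for a link-clause
// variable @gvar this materialises a weak reference pointer
//   @gvar_decl_tgt_ref_ptr = weak global ptr @gvar
// and a non-externally-visible symbol additionally gets a "_<FileID:x>"
// infix, e.g. @gvar_34_decl_tgt_ref_ptr.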
10415
10416void OpenMPIRBuilder::registerTargetGlobalVariable(
10417 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10418 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10419 bool IsDeclaration, bool IsExternallyVisible,
10420 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10421 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10422 std::vector<Triple> TargetTriple,
10423 std::function<Constant *()> GlobalInitializer,
10424 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10425 Constant *Addr) {
10426 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10427 (TargetTriple.empty() && !Config.isTargetDevice()))
10428 return;
10429
10430 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10431 StringRef VarName;
10432 int64_t VarSize;
10433 GlobalValue::LinkageTypes Linkage;
10434
10435 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10436 CaptureClause ==
10437 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10438 !Config.hasRequiresUnifiedSharedMemory()) {
10439 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10440 VarName = MangledName;
10441 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10442
10443 if (!IsDeclaration)
10444 VarSize = divideCeil(
10445 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10446 else
10447 VarSize = 0;
10448 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10449
10450 // This is a workaround carried over from Clang which prevents undesired
10451 // optimisation of internal variables.
10452 if (Config.isTargetDevice() &&
10453 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10454 // Do not create a "ref-variable" if the original is not also available
10455 // on the host.
10456 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10457 return;
10458
10459 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10460
10461 if (!M.getNamedValue(RefName)) {
10462 Constant *AddrRef =
10463 getOrCreateInternalVariable(Addr->getType(), RefName);
10464 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10465 GvAddrRef->setConstant(true);
10466 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10467 GvAddrRef->setInitializer(Addr);
10468 GeneratedRefs.push_back(GvAddrRef);
10469 }
10470 }
10471 } else {
10472 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10473 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10474 else
10475 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10476
10477 if (Config.isTargetDevice()) {
10478 VarName = (Addr) ? Addr->getName() : "";
10479 Addr = nullptr;
10480 } else {
10481 Addr = getAddrOfDeclareTargetVar(
10482 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10483 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10484 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10485 VarName = (Addr) ? Addr->getName() : "";
10486 }
10487 VarSize = M.getDataLayout().getPointerSize();
10488 Linkage = GlobalValue::WeakAnyLinkage;
10489 }
10490
10491 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10492 Flags, Linkage);
10493}
10494
10495/// Loads all the offload entries information from the host IR
10496/// metadata.
10497void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10498 // If we are in target mode, load the metadata from the host IR. This code has
10499 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10500
10501 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10502 if (!MD)
10503 return;
10504
10505 for (MDNode *MN : MD->operands()) {
10506 auto &&GetMDInt = [MN](unsigned Idx) {
10507 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10508 return cast<ConstantInt>(V->getValue())->getZExtValue();
10509 };
10510
10511 auto &&GetMDString = [MN](unsigned Idx) {
10512 auto *V = cast<MDString>(MN->getOperand(Idx));
10513 return V->getString();
10514 };
10515
10516 switch (GetMDInt(0)) {
10517 default:
10518 llvm_unreachable("Unexpected metadata!");
10519 break;
10520 case OffloadEntriesInfoManager::OffloadEntryInfo::
10521 OffloadingEntryInfoTargetRegion: {
10522 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10523 /*DeviceID=*/GetMDInt(1),
10524 /*FileID=*/GetMDInt(2),
10525 /*Line=*/GetMDInt(4),
10526 /*Count=*/GetMDInt(5));
10527 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10528 /*Order=*/GetMDInt(6));
10529 break;
10530 }
10531 case OffloadEntriesInfoManager::OffloadEntryInfo::
10532 OffloadingEntryInfoDeviceGlobalVar:
10533 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10534 /*MangledName=*/GetMDString(1),
10535 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10536 /*Flags=*/GetMDInt(2)),
10537 /*Order=*/GetMDInt(3));
10538 break;
10539 }
10540 }
10541}
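// Usage sketch (illustrative; the file name is made up): during device
// compilation the manager is seeded from the host IR so that entry ordering
// matches the host-side createOffloadEntriesAndInfoMetadata() output.
//
// OMPBuilder.loadOffloadInfoMetadata(*vfs::getRealFileSystem(), "host.bc");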
10542
10543void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10544 StringRef HostFilePath) {
10545 if (HostFilePath.empty())
10546 return;
10547
10548 auto Buf = VFS.getBufferForFile(HostFilePath);
10549 if (std::error_code Err = Buf.getError()) {
10550 report_fatal_error(("error opening host file from host file path inside of "
10551 "OpenMPIRBuilder: " +
10552 Err.message())
10553 .c_str());
10554 }
10555
10556 LLVMContext Ctx;
10557 auto M = expectedToErrorOrAndEmitErrors(
10558 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10559 if (std::error_code Err = M.getError()) {
10561 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10562 .c_str());
10563 }
10564
10565 loadOffloadInfoMetadata(*M.get());
10566}
10567
10568//===----------------------------------------------------------------------===//
10569// OffloadEntriesInfoManager
10570//===----------------------------------------------------------------------===//
10571
10572bool OffloadEntriesInfoManager::empty() const {
10573 return OffloadEntriesTargetRegion.empty() &&
10574 OffloadEntriesDeviceGlobalVar.empty();
10575}
10576
10577unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10578 const TargetRegionEntryInfo &EntryInfo) const {
10579 auto It = OffloadEntriesTargetRegionCount.find(
10580 getTargetRegionEntryCountKey(EntryInfo));
10581 if (It == OffloadEntriesTargetRegionCount.end())
10582 return 0;
10583 return It->second;
10584}
10585
10586void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10587 const TargetRegionEntryInfo &EntryInfo) {
10588 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10589 EntryInfo.Count + 1;
10590}
10591
10592/// Initialize target region entry.
10593void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10594 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10595 OffloadEntriesTargetRegion[EntryInfo] =
10596 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10597 OMPTargetRegionEntryTargetRegion);
10598 ++OffloadingEntriesNum;
10599}
10600
10601void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10602 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10603 OMPTargetRegionEntryKind Flags) {
10604 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10605
10606 // Update the EntryInfo with the next available count for this location.
10607 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10608
10609 // If we are emitting code for a target device, the entry is already
10610 // initialized; it only has to be registered.
10611 if (OMPBuilder->Config.isTargetDevice()) {
10612 // This could happen if the device compilation is invoked standalone.
10613 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10614 return;
10615 }
10616 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10617 Entry.setAddress(Addr);
10618 Entry.setID(ID);
10619 Entry.setFlags(Flags);
10620 } else {
10621 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10622 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10623 return;
10624 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10625 "Target region entry already registered!");
10626 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10627 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10628 ++OffloadingEntriesNum;
10629 }
10630 incrementTargetRegionEntryInfoCount(EntryInfo);
10631}
10632
10633bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10634 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10635
10636 // Update the EntryInfo with the next available count for this location.
10637 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10638
10639 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10640 if (It == OffloadEntriesTargetRegion.end()) {
10641 return false;
10642 }
10643 // Fail if this entry is already registered.
10644 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10645 return false;
10646 return true;
10647}
10648
10649void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10650 const OffloadTargetRegionEntryInfoActTy &Action) {
10651 // Scan all target region entries and perform the provided action.
10652 for (const auto &It : OffloadEntriesTargetRegion) {
10653 Action(It.first, It.second);
10654 }
10655}
10656
10657void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10658 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10659 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10660 ++OffloadingEntriesNum;
10661}
10662
10663void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10664 StringRef VarName, Constant *Addr, int64_t VarSize,
10665 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10666 if (OMPBuilder->Config.isTargetDevice()) {
10667 // This could happen if the device compilation is invoked standalone.
10668 if (!hasDeviceGlobalVarEntryInfo(VarName))
10669 return;
10670 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10671 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10672 if (Entry.getVarSize() == 0) {
10673 Entry.setVarSize(VarSize);
10674 Entry.setLinkage(Linkage);
10675 }
10676 return;
10677 }
10678 Entry.setVarSize(VarSize);
10679 Entry.setLinkage(Linkage);
10680 Entry.setAddress(Addr);
10681 } else {
10682 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10683 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10684 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10685 "Entry not initialized!");
10686 if (Entry.getVarSize() == 0) {
10687 Entry.setVarSize(VarSize);
10688 Entry.setLinkage(Linkage);
10689 }
10690 return;
10691 }
10692 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10693 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10694 Addr, VarSize, Flags, Linkage,
10695 VarName.str());
10696 else
10697 OffloadEntriesDeviceGlobalVar.try_emplace(
10698 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10699 ++OffloadingEntriesNum;
10700 }
10701}
10702
10703void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10704 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10705 // Scan all device global variable entries and perform the provided action.
10706 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10707 Action(E.getKey(), E.getValue());
10708}
10709
10710//===----------------------------------------------------------------------===//
10711// CanonicalLoopInfo
10712//===----------------------------------------------------------------------===//
10713
10714void CanonicalLoopInfo::collectControlBlocks(
10715 SmallVectorImpl<BasicBlock *> &BBs) {
10716 // We only count those BBs as control blocks for which we do not need to
10717 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10718 // flow. For consistency, this also means we do not add the Body block, which
10719 // is just the entry to the body code.
10720 BBs.reserve(BBs.size() + 6);
10721 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10722}
10723
10724BasicBlock *CanonicalLoopInfo::getPreheader() const {
10725 assert(isValid() && "Requires a valid canonical loop");
10726 for (BasicBlock *Pred : predecessors(Header)) {
10727 if (Pred != Latch)
10728 return Pred;
10729 }
10730 llvm_unreachable("Missing preheader");
10731}
10732
10733void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10734 assert(isValid() && "Requires a valid canonical loop");
10735
10736 Instruction *CmpI = &getCond()->front();
10737 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10738 CmpI->setOperand(1, TripCount);
10739
10740#ifndef NDEBUG
10741 assertOK();
10742#endif
10743}
10744
10745void CanonicalLoopInfo::mapIndVar(
10746 llvm::function_ref<Value *(Instruction *)> Updater) {
10747 assert(isValid() && "Requires a valid canonical loop");
10748
10749 Instruction *OldIV = getIndVar();
10750
10751 // Record all uses excluding those introduced by the updater. Uses by the
10752 // CanonicalLoopInfo itself to keep track of the number of iterations are
10753 // excluded.
10754 SmallVector<Use *> ReplacableUses;
10755 for (Use &U : OldIV->uses()) {
10756 auto *User = dyn_cast<Instruction>(U.getUser());
10757 if (!User)
10758 continue;
10759 if (User->getParent() == getCond())
10760 continue;
10761 if (User->getParent() == getLatch())
10762 continue;
10763 ReplacableUses.push_back(&U);
10764 }
10765
10766 // Run the updater that may introduce new uses
10767 Value *NewIV = Updater(OldIV);
10768
10769 // Replace the old uses with the value returned by the updater.
10770 for (Use *U : ReplacableUses)
10771 U->set(NewIV);
10772
10773#ifndef NDEBUG
10774 assertOK();
10775#endif
10776}
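// Usage sketch (illustrative): a typical Updater that rewrites body uses of
// the canonical IV to a strided value; CLI, Builder, Step and Start are
// assumed to exist in the caller.
//
// CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//   Builder.SetInsertPoint(CLI->getBody(),
//                          CLI->getBody()->getFirstNonPHIIt());
//   // Uses created here are new and therefore not rewritten by mapIndVar.
//   Value *Scaled = Builder.CreateMul(OldIV, Step, "iv.scaled");
//   return Builder.CreateAdd(Scaled, Start, "iv.strided");
// });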
10777
10778void CanonicalLoopInfo::assertOK() const {
10779#ifndef NDEBUG
10780 // No constraints if this object currently does not describe a loop.
10781 if (!isValid())
10782 return;
10783
10784 BasicBlock *Preheader = getPreheader();
10785 BasicBlock *Body = getBody();
10786 BasicBlock *After = getAfter();
10787
10788 // Verify standard control-flow we use for OpenMP loops.
10789 assert(Preheader);
10790 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10791 "Preheader must terminate with unconditional branch");
10792 assert(Preheader->getSingleSuccessor() == Header &&
10793 "Preheader must jump to header");
10794
10795 assert(Header);
10796 assert(isa<BranchInst>(Header->getTerminator()) &&
10797 "Header must terminate with unconditional branch");
10798 assert(Header->getSingleSuccessor() == Cond &&
10799 "Header must jump to exiting block");
10800
10801 assert(Cond);
10802 assert(Cond->getSinglePredecessor() == Header &&
10803 "Exiting block only reachable from header");
10804
10805 assert(isa<BranchInst>(Cond->getTerminator()) &&
10806 "Exiting block must terminate with conditional branch");
10807 assert(size(successors(Cond)) == 2 &&
10808 "Exiting block must have two successors");
10809 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10810 "Exiting block's first successor jump to the body");
10811 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10812 "Exiting block's second successor must exit the loop");
10813
10814 assert(Body);
10815 assert(Body->getSinglePredecessor() == Cond &&
10816 "Body only reachable from exiting block");
10817 assert(!isa<PHINode>(Body->front()));
10818
10819 assert(Latch);
10821 "Latch must terminate with unconditional branch");
10822 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10823 // TODO: To support simple redirecting of the end of the body code that has
10824 // multiple predecessors; introduce another auxiliary basic block like preheader and after.
10825 assert(Latch->getSinglePredecessor() != nullptr);
10826 assert(!isa<PHINode>(Latch->front()));
10827
10828 assert(Exit);
10829 assert(isa<BranchInst>(Exit->getTerminator()) &&
10830 "Exit block must terminate with unconditional branch");
10831 assert(Exit->getSingleSuccessor() == After &&
10832 "Exit block must jump to after block");
10833
10834 assert(After);
10835 assert(After->getSinglePredecessor() == Exit &&
10836 "After block only reachable from exit block");
10837 assert(After->empty() || !isa<PHINode>(After->front()));
10838
10839 Instruction *IndVar = getIndVar();
10840 assert(IndVar && "Canonical induction variable not found?");
10841 assert(isa<IntegerType>(IndVar->getType()) &&
10842 "Induction variable must be an integer");
10843 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10844 "Induction variable must be a PHI in the loop header");
10845 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10846 assert(
10847 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10848 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10849
10850 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10851 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10852 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10853 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10854 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10855 ->isOne());
10856
10857 Value *TripCount = getTripCount();
10858 assert(TripCount && "Loop trip count not found?");
10859 assert(IndVar->getType() == TripCount->getType() &&
10860 "Trip count and induction variable must have the same type");
10861
10862 auto *CmpI = cast<CmpInst>(&Cond->front());
10863 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10864 "Exit condition must be a signed less-than comparison");
10865 assert(CmpI->getOperand(0) == IndVar &&
10866 "Exit condition must compare the induction variable");
10867 assert(CmpI->getOperand(1) == TripCount &&
10868 "Exit condition must compare with the trip count");
10869#endif
10870}
10871
10872void CanonicalLoopInfo::invalidate() {
10873 Header = nullptr;
10874 Cond = nullptr;
10875 Latch = nullptr;
10876 Exit = nullptr;
10877}
10878}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static Value * removeASCastIfPresent(Value *V)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn)
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:535
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
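A sketch combining the constant factories above (variable names invented for the example):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Build a few i32 constants: zero, all-ones, and a signed value.
static void makeConstants(LLVMContext &Ctx) {
  IntegerType *I32 = Type::getInt32Ty(Ctx);
  Constant *Zero = Constant::getNullValue(I32);       // i32 0
  Constant *AllOnes = Constant::getAllOnesValue(I32); // i32 -1
  ConstantInt *MinusTwo = ConstantInt::getSigned(I32, -2);
  (void)Zero; (void)AllOnes; (void)MinusTwo;
}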
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string, together with methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:466
Record of a variable value-assignment, aka a non-instruction representation of the dbg.value intrinsic.
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
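A sketch of Function::Create plus a few of the accessors above (the function name "f" and its signature are the example's assumptions):

#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Declare i32 @f(i32, i32) in M, name both parameters, and mark it nounwind.
static Function *declareBinaryFn(Module &M) {
  LLVMContext &Ctx = M.getContext();
  Type *I32 = Type::getInt32Ty(Ctx);
  FunctionType *FTy = FunctionType::get(I32, {I32, I32}, /*isVarArg=*/false);
  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage,
                                 /*AddrSpace=*/0, "f", &M);
  F->getArg(0)->setName("a");
  F->getArg(1)->setName("b");
  F->addFnAttr(Attribute::NoUnwind);
  return F;
}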
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if InitVal is null.
Definition Globals.cpp:523
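A sketch of creating and initializing a global (the name "g" and the linkage choice are illustrative):

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Create an internal, zero-initialized i32 global and keep it DSO-local.
static GlobalVariable *makeCounter(Module &M) {
  Type *I32 = Type::getInt32Ty(M.getContext());
  auto *GV = new GlobalVariable(M, I32, /*isConstant=*/false,
                                GlobalValue::InternalLinkage,
                                Constant::getNullValue(I32), "g");
  GV->setDSOLocal(true);
  return GV;
}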
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition IRBuilder.h:2780
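A sketch of the save/restore pattern InsertPoint enables (the helper is invented for the example):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Emit at the top of the entry block, then restore the caller's insertion
// point; saveIP and restoreIP round-trip the (block, iterator) pair.
static void withEntryBlock(IRBuilder<> &Builder, Function &F) {
  IRBuilderBase::InsertPoint Saved = Builder.saveIP();
  BasicBlock &Entry = F.getEntryBlock();
  Builder.SetInsertPoint(&Entry, Entry.getFirstInsertionPt());
  // ... create instructions at the top of the entry block ...
  Builder.restoreIP(Saved);
}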
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the debug location of the next non-debug node.
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a parent module.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ordering.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1573
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1443
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1565
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:607
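A sketch tying the metadata factories together (the string operands are placeholders):

#include "llvm/IR/Metadata.h"

using namespace llvm;

// Build a distinct metadata tuple and later rewrite its first operand;
// distinct nodes may be mutated in place via replaceOperandWith.
static MDNode *makeVersionedNode(LLVMContext &Ctx) {
  Metadata *Ops[] = {MDString::get(Ctx, "v1")};
  MDTuple *Node = MDTuple::getDistinct(Ctx, Ops);
  Node->replaceOperandWith(0, MDString::get(Ctx, "v2"));
  return Node;
}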
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1753
iterator_range< op_iterator > operands()
Definition Metadata.h:1849
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:247
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all small sizes.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
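A sketch of the two StoreInst setters above (the ordering and alignment are chosen arbitrarily for the example):

#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Emit a 4-byte-aligned monotonic atomic store of Val through Ptr.
static void emitRelaxedStore(IRBuilder<> &Builder, Value *Val, Value *Ptr) {
  StoreInst *SI = Builder.CreateStore(Val, Ptr);
  SI->setAlignment(Align(4));
  SI->setAtomic(AtomicOrdering::Monotonic);
}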
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition StringMap.h:255
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:233
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:461
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:281
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:626
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
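A sketch contrasting literal and identified structs (the name "pair.i32" is invented for the example):

#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// A literal { i32, i32 } and a named identified struct with the same body.
static StructType *makePair(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  StructType *Literal = StructType::get(Ctx, {I32, I32});
  (void)Literal;
  StructType *Named = StructType::create(Ctx, "pair.i32");
  Named->setBody({I32, I32}, /*isPacked=*/false);
  return Named;
}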
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1040
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1102
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:411
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1112
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:133
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:149
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldReplace returns true for the given Use.
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can have multiple uses of this value).
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
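A sketch of the common replace-all-uses idiom built from the accessors above (the helper name is invented):

#include "llvm/IR/Value.h"

using namespace llvm;

// Redirect every use of Old to New and transfer Old's name; both values
// must have the same type.
static void replaceAndRename(Value *Old, Value *New) {
  New->takeName(Old);
  Old->replaceAllUsesWith(New);
}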
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:359
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:85
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped.
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is a member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined; the runtime should get it from environment variables in the spec.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
LLVM_ABI std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:311
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:824
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1666
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:834
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular function.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, ...) pairing a zero-based index with the corresponding elements of the input ranges.
Definition STLExtras.h:2461
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:294
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2125
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:627
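A sketch of the early-increment idiom (the dead-code filter is only an example policy):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Erase dead instructions while iterating; the early-increment range keeps
// iteration valid across eraseFromParent.
static void dropTriviallyDead(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
      I.eraseFromParent();
}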
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:401
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
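A sketch of carving an if/then/else diamond out of straight-line code (names invented for the example):

#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

// Split before SplitBefore under the guard Cond; ThenTerm and ElseTerm end
// up as branches to the common tail block.
static void makeDiamond(Value *Cond, Instruction *SplitBefore) {
  Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  SplitBlockAndInsertIfThenElse(Cond, SplitBefore->getIterator(), &ThenTerm,
                                &ElseTerm);
  // Emit then-side code before ThenTerm and else-side code before ElseTerm.
}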
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1728
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:126
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user specified parameters.
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
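A sketch of the simplest SplitBlock use (passing nullptr skips dominator-tree updates; a real pass would usually thread its DominatorTree through):

#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

// Split BB right before SplitPt; BB is left ending in an unconditional
// branch to the returned tail block.
static BasicBlock *splitBefore(BasicBlock *BB, Instruction *SplitPt) {
  return SplitBlock(BB, SplitPt->getIterator(), /*DT=*/nullptr);
}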
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs from their parent function.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0, the radix is auto-detected.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.