//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  return Result;
}

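// Illustrative sketch (not part of the builder): how the three steps above
// compose for `schedule(dynamic, 4)` with no ordered clause and no
// monotonicity modifier. The base type is BaseDynamicChunked, the ordering
// step adds ModifierUnordered, and the monotonicity step adds
// ModifierNonmonotonic because the schedule is neither static nor ordered.
//
//   OMPScheduleType Ty = computeOpenMPScheduleType(
//       OMP_SCHEDULE_Dynamic, /*HasChunks=*/true, /*HasSimdModifier=*/false,
//       /*HasMonotonicModifier=*/false, /*HasNonmonotonicModifier=*/false,
//       /*HasOrderedClause=*/false);
//   // Ty == OMPScheduleType::UnorderedDynamicChunked |
//   //       OMPScheduleType::ModifierNonmonotonic
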
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g., a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

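// Illustrative usage sketch (not from this file): splitting the current block
// while continuing to emit code into the old half. With CreateBranch=true the
// builder is left in front of the newly created terminator of the old block,
// and the returned block receives everything that followed the original
// insert point.
//
//   BasicBlock *Cont = splitBBWithSuffix(Builder, /*CreateBranch=*/true,
//                                        ".cont");
//   // ... emit code into the old block, before its new branch ...
//   Builder.SetInsertPoint(Cont, Cont->begin()); // resume in the new block
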
// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful for modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

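// Illustrative usage sketch (not from this file): constructing a configuration
// for a host compilation of a translation unit containing
// `#pragma omp requires unified_shared_memory`.
//
//   OpenMPIRBuilderConfig Config(/*IsTargetDevice=*/false, /*IsGPU=*/false,
//                                /*OpenMPOffloadMandatory=*/false,
//                                /*HasRequiresReverseOffload=*/false,
//                                /*HasRequiresUnifiedAddress=*/false,
//                                /*HasRequiresUnifiedSharedMemory=*/true,
//                                /*HasRequiresDynamicAllocators=*/false);
//   assert(Config.hasRequiresUnifiedSharedMemory());
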
//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

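// Illustrative usage sketch (not from this file): materializing a runtime
// declaration and calling it. This is the pattern used throughout the builder,
// e.g. in createBarrier() further below; `OMPBuilder`, `Ident`, and `ThreadID`
// are placeholders.
//
//   Function *BarrierFn =
//       OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_barrier);
//   Builder.CreateCall(BarrierFn, {Ident, ThreadID});
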
void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug records the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target function, or we risk malformed optimisations by later passes. This
  // is only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here; it's up to the inserter into the list to do so).
  // This notably has to occur after the OutlineInfo candidates have been
  // extracted, so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it will yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization\n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}

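// Example derived from the code above: for function "foo" in file "test.c" at
// line 3, column 5, the resulting location string is ";test.c;foo;3;5;;".
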
Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}

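// Illustrative sketch of the IR emitted by createBarrier() above, assuming a
// cancellable parallel region so that the cancel barrier variant is chosen:
//
//   %r = call i32 @__kmpc_cancel_barrier(ptr @ident, i32 %thread_id)
//   ; followed by the check emitted by emitCancelationCheckImpl(%r, ...)
//
// In the non-cancellable case a plain call to @__kmpc_barrier with the same
// (ident, thread_id) arguments is emitted instead, and no check is needed.
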
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

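// Illustrative sketch of the control flow produced by emitKernelLaunch():
//
//   %ret = call i32 @__tgt_target_kernel(ptr %ident, i64 %device_id, ...)
//   %failed = icmp ne i32 %ret, 0
//   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
//
// omp_offload.failed:                  ; host fallback
//   ; code emitted by EmitTargetCallFallbackCB
//   br label %omp_offload.cont
//
// omp_offload.cont:                    ; normal continuation
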
Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

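// Illustrative sketch of the check emitted above, where %cancel_flag is the
// runtime call's return value:
//
//   %cmp = icmp eq i32 %cancel_flag, 0
//   br i1 %cmp, label %bb.cont, label %bb.cncl
//
// The ".cncl" block runs the ExitCB and FiniCB finalization callbacks; the
// ".cont" block is where code generation continues.
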
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if the array for storing arguments is not allocated
  // in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

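// Illustrative shape of the device runtime call built above (value names are
// placeholders):
//
//   call void @__kmpc_parallel_51(ptr %ident, i32 %thread_id, i32 %if_cond,
//                                 i32 %num_threads, i32 -1 /* proc bind */,
//                                 ptr @outlined_fn, ptr null /* wrapper */,
//                                 ptr %args_array, i64 %num_args)
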
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

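// Illustrative shape of the host runtime call built above, for two captured
// variables and no if-condition (value names are placeholders):
//
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr %ident, i32 2, ptr @outlined_fn,
//                         ptr %cap0, ptr %cap1)
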
1493OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1494 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1495 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1496 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1497 omp::ProcBindKind ProcBind, bool IsCancellable) {
1498 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1499
1500 if (!updateToLocation(Loc))
1501 return Loc.IP;
1502
1503 uint32_t SrcLocStrSize;
1504 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1505 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1506 Value *ThreadID = getOrCreateThreadID(Ident);
1507 // If we generate code for the target device, we need to allocate
1508 // struct for aggregate params in the device default alloca address space.
1509 // OpenMP runtime requires that the params of the extracted functions are
1510 // passed as zero address space pointers. This flag ensures that extracted
1511 // function arguments are declared in zero address space
1512 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1513
1514 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1515 // only if we compile for host side.
1516 if (NumThreads && !Config.isTargetDevice()) {
1517 Value *Args[] = {
1518 Ident, ThreadID,
1519 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1520 Builder.CreateCall(
1521 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1522 }
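// As a hypothetical example, `#pragma omp parallel num_threads(4)` compiled
// for the host emits, right before the fork:
//   call void @__kmpc_push_num_threads(ptr @ident, i32 %gtid, i32 4)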
1523
1524 if (ProcBind != OMP_PROC_BIND_default) {
1525 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1526 Value *Args[] = {
1527 Ident, ThreadID,
1528 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1529 Builder.CreateCall(
1530 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1531 }
1532
1533 BasicBlock *InsertBB = Builder.GetInsertBlock();
1534 Function *OuterFn = InsertBB->getParent();
1535
1536 // Save the outer alloca block because the insertion iterator may get
1537 // invalidated and we still need this later.
1538 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1539
1540 // Vector to remember instructions we used only during the modeling but which
1541 // we want to delete at the end.
1542 SmallVector<Instruction *, 4> ToBeDeleted;
1543
1544 // Change the location to the outer alloca insertion point to create and
1545 // initialize the allocas we pass into the parallel region.
1546 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1547 Builder.restoreIP(NewOuter);
1548 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1549 AllocaInst *ZeroAddrAlloca =
1550 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1551 Instruction *TIDAddr = TIDAddrAlloca;
1552 Instruction *ZeroAddr = ZeroAddrAlloca;
1553 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1554 // Add additional casts to enforce pointers in zero address space
1555 TIDAddr = new AddrSpaceCastInst(
1556 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1557 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1558 ToBeDeleted.push_back(TIDAddr);
1559 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1560 PointerType::get(M.getContext(), 0),
1561 "zero.addr.ascast");
1562 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1563 ToBeDeleted.push_back(ZeroAddr);
1564 }
1565
1566 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1567 // associated arguments in the outlined function, so we delete them later.
1568 ToBeDeleted.push_back(TIDAddrAlloca);
1569 ToBeDeleted.push_back(ZeroAddrAlloca);
1570
1571 // Create an artificial insertion point that will also ensure the blocks we
1572 // are about to split are not degenerated.
1573 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1574
1575 BasicBlock *EntryBB = UI->getParent();
1576 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1577 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1578 BasicBlock *PRegPreFiniBB =
1579 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1580 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1581
1582 auto FiniCBWrapper = [&](InsertPointTy IP) {
1583 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1584 // target to the region exit block.
1585 if (IP.getBlock()->end() == IP.getPoint()) {
1586 IRBuilder<>::InsertPointGuard IPG(Builder);
1587 Builder.restoreIP(IP);
1588 Instruction *I = Builder.CreateBr(PRegExitBB);
1589 IP = InsertPointTy(I->getParent(), I->getIterator());
1590 }
1591 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1592 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1593 "Unexpected insertion point for finalization call!");
1594 return FiniCB(IP);
1595 };
1596
1597 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1598
1599 // Generate the privatization allocas in the block that will become the entry
1600 // of the outlined function.
1601 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1602 InsertPointTy InnerAllocaIP = Builder.saveIP();
1603
1604 AllocaInst *PrivTIDAddr =
1605 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1606 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1607
1608 // Add some fake uses for OpenMP provided arguments.
1609 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1610 Instruction *ZeroAddrUse =
1611 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1612 ToBeDeleted.push_back(ZeroAddrUse);
1613
1614 // EntryBB
1615 // |
1616 // V
1617 // PRegionEntryBB <- Privatization allocas are placed here.
1618 // |
1619 // V
1620 // PRegionBodyBB <- BodyGen is invoked here.
1621 // |
1622 // V
1623 // PRegPreFiniBB <- The block we will start finalization from.
1624 // |
1625 // V
1626 // PRegionExitBB <- A common exit to simplify block collection.
1627 //
1628
1629 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1630
1631 // Let the caller create the body.
1632 assert(BodyGenCB && "Expected body generation callback!");
1633 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1634 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1635 return Err;
1636
1637 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1638
1639 OutlineInfo OI;
1640 if (Config.isTargetDevice()) {
1641 // Generate OpenMP target specific runtime call
1642 OI.PostOutlineCB = [=, ToBeDeletedVec =
1643 std::move(ToBeDeleted)](Function &OutlinedFn) {
1644 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1645 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1646 ThreadID, ToBeDeletedVec);
1647 };
1648 } else {
1649 // Generate OpenMP host runtime call
1650 OI.PostOutlineCB = [=, ToBeDeletedVec =
1651 std::move(ToBeDeleted)](Function &OutlinedFn) {
1652 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1653 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1654 };
1655 }
1656
1657 OI.OuterAllocaBB = OuterAllocaBlock;
1658 OI.EntryBB = PRegEntryBB;
1659 OI.ExitBB = PRegExitBB;
1660
1661 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1662 SmallVector<BasicBlock *, 32> Blocks;
1663 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1664
1665 CodeExtractorAnalysisCache CEAC(*OuterFn);
1666 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1667 /* AggregateArgs */ false,
1668 /* BlockFrequencyInfo */ nullptr,
1669 /* BranchProbabilityInfo */ nullptr,
1670 /* AssumptionCache */ nullptr,
1671 /* AllowVarArgs */ true,
1672 /* AllowAlloca */ true,
1673 /* AllocationBlock */ OuterAllocaBlock,
1674 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1675
1676 // Find inputs to, outputs from the code region.
1677 BasicBlock *CommonExit = nullptr;
1678 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1679 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1680
1681 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1682 /*CollectGlobalInputs=*/true);
1683
1684 Inputs.remove_if([&](Value *I) {
1685 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1686 return GV->getValueType() == OpenMPIRBuilder::Ident;
1687
1688 return false;
1689 });
1690
1691 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1692
1693 FunctionCallee TIDRTLFn =
1694 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1695
1696 auto PrivHelper = [&](Value &V) -> Error {
1697 if (&V == TIDAddr || &V == ZeroAddr) {
1698 OI.ExcludeArgsFromAggregate.push_back(&V);
1699 return Error::success();
1700 }
1701
1702 SetVector<Use *> Uses;
1703 for (Use &U : V.uses())
1704 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1705 if (ParallelRegionBlockSet.count(UserI->getParent()))
1706 Uses.insert(&U);
1707
1708 // __kmpc_fork_call expects extra arguments as pointers. If the input
1709 // already has a pointer type, everything is fine. Otherwise, store the
1710 // value onto stack and load it back inside the to-be-outlined region. This
1711 // will ensure only the pointer will be passed to the function.
1712 // FIXME: if there are more than 15 trailing arguments, they must be
1713 // additionally packed in a struct.
1714 Value *Inner = &V;
1715 if (!V.getType()->isPointerTy()) {
1716 IRBuilder<>::InsertPointGuard Guard(Builder);
1717 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1718
1719 Builder.restoreIP(OuterAllocaIP);
1720 Value *Ptr =
1721 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1722
1723 // Store to stack at end of the block that currently branches to the entry
1724 // block of the to-be-outlined region.
1725 Builder.SetInsertPoint(InsertBB,
1726 InsertBB->getTerminator()->getIterator());
1727 Builder.CreateStore(&V, Ptr);
1728
1729 // Load back next to allocations in the to-be-outlined region.
1730 Builder.restoreIP(InnerAllocaIP);
1731 Inner = Builder.CreateLoad(V.getType(), Ptr);
1732 }
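// Sketch of the wrapping above for a captured `i64 %n` (illustrative SSA
// names): `%n.reloaded = alloca i64` at the outer alloca point, a store of
// %n right before the region entry, and a load next to the inner allocas;
// only the pointer is then passed to the outlined function.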
1733
1734 Value *ReplacementValue = nullptr;
1735 CallInst *CI = dyn_cast<CallInst>(&V);
1736 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1737 ReplacementValue = PrivTID;
1738 } else {
1739 InsertPointOrErrorTy AfterIP =
1740 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1741 if (!AfterIP)
1742 return AfterIP.takeError();
1743 Builder.restoreIP(*AfterIP);
1744 InnerAllocaIP = {
1745 InnerAllocaIP.getBlock(),
1746 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1747
1748 assert(ReplacementValue &&
1749 "Expected copy/create callback to set replacement value!");
1750 if (ReplacementValue == &V)
1751 return Error::success();
1752 }
1753
1754 for (Use *UPtr : Uses)
1755 UPtr->set(ReplacementValue);
1756
1757 return Error::success();
1758 };
1759
1760 // Reset the inner alloca insertion as it will be used for loading the values
1761 // wrapped into pointers before passing them into the to-be-outlined region.
1762 // Configure it to insert immediately after the fake use of zero address so
1763 // that they are available in the generated body and so that the
1764 // OpenMP-related values (thread ID and zero address pointers) remain leading
1765 // in the argument list.
1766 InnerAllocaIP = IRBuilder<>::InsertPoint(
1767 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1768
1769 // Reset the outer alloca insertion point to the entry of the relevant block
1770 // in case it was invalidated.
1771 OuterAllocaIP = IRBuilder<>::InsertPoint(
1772 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1773
1774 for (Value *Input : Inputs) {
1775 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1776 if (Error Err = PrivHelper(*Input))
1777 return Err;
1778 }
1779 LLVM_DEBUG({
1780 for (Value *Output : Outputs)
1781 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1782 });
1783 assert(Outputs.empty() &&
1784 "OpenMP outlining should not produce live-out values!");
1785
1786 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1787 LLVM_DEBUG({
1788 for (auto *BB : Blocks)
1789 dbgs() << " PBR: " << BB->getName() << "\n";
1790 });
1791
1792 // Adjust the finalization stack, verify the adjustment, and call the
1793 // finalize function a last time to finalize values between the pre-fini
1794 // block and the exit block if we left the parallel region "the normal way".
1795 auto FiniInfo = FinalizationStack.pop_back_val();
1796 (void)FiniInfo;
1797 assert(FiniInfo.DK == OMPD_parallel &&
1798 "Unexpected finalization stack state!");
1799
1800 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1801
1802 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1803 if (Error Err = FiniCB(PreFiniIP))
1804 return Err;
1805
1806 // Register the outlined info.
1807 addOutlineInfo(std::move(OI));
1808
1809 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1810 UI->eraseFromParent();
1811
1812 return AfterIP;
1813}
1814
1815void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1816 // Build call void __kmpc_flush(ident_t *loc)
1817 uint32_t SrcLocStrSize;
1818 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1819 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1820
1821 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
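// The emitted IR is a single runtime call, e.g. `call void
// @__kmpc_flush(ptr @ident)`; the runtime performs the actual memory flush.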
1822}
1823
1824void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1825 if (!updateToLocation(Loc))
1826 return;
1827 emitFlush(Loc);
1828}
1829
1830void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1831 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1832 // global_tid);
1833 uint32_t SrcLocStrSize;
1834 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1835 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1836 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1837
1838 // Ignore return result until untied tasks are supported.
1839 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1840 Args);
1841}
1842
1843void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1844 if (!updateToLocation(Loc))
1845 return;
1846 emitTaskwaitImpl(Loc);
1847}
1848
1849void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1850 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1851 uint32_t SrcLocStrSize;
1852 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1853 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1854 Constant *I32Null = ConstantInt::getNullValue(Int32);
1855 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1856
1857 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1858 Args);
1859}
1860
1861void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1862 if (!updateToLocation(Loc))
1863 return;
1864 emitTaskyieldImpl(Loc);
1865}
1866
1867// Processes the dependencies in Dependencies and does the following
1868 // - Allocates space on the stack for an array of DependInfo objects
1869// - Populates each DependInfo object with relevant information of
1870// the corresponding dependence.
1871// - All code is inserted in the entry block of the current function.
1872 static Value *emitTaskDependencies(
1873 OpenMPIRBuilder &OMPBuilder,
1874 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1875 // Early return if we have no dependencies to process
1876 if (Dependencies.empty())
1877 return nullptr;
1878
1879 // Given a vector of DependData objects, in this function we create an
1880 // array on the stack that holds kmp_depend_info objects corresponding
1881 // to each dependency. This is then passed to the OpenMP runtime.
1882 // For example, if there are 'n' dependencies then the following pseudo
1883 // code is generated. Assume the first dependence is on a variable 'a'
1884 //
1885 // \code{c}
1886 // DepArray = alloca(n x sizeof(kmp_depend_info));
1887 // idx = 0;
1888 // DepArray[idx].base_addr = ptrtoint(&a);
1889 // DepArray[idx].len = 8;
1890 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1891 // ++idx;
1892 // DepArray[idx].base_addr = ...;
1893 // \endcode
1894
1895 IRBuilderBase &Builder = OMPBuilder.Builder;
1896 Type *DependInfo = OMPBuilder.DependInfo;
1897 Module &M = OMPBuilder.M;
1898
1899 Value *DepArray = nullptr;
1900 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1901 Builder.SetInsertPoint(
1902 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1903
1904 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1905 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1906
1907 Builder.restoreIP(OldIP);
1908
1909 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1910 Value *Base =
1911 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1912 // Store the pointer to the variable
1913 Value *Addr = Builder.CreateStructGEP(
1914 DependInfo, Base,
1915 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1916 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1917 Builder.CreateStore(DepValPtr, Addr);
1918 // Store the size of the variable
1919 Value *Size = Builder.CreateStructGEP(
1920 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1921 Builder.CreateStore(
1922 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1923 Size);
1924 // Store the dependency kind
1925 Value *Flags = Builder.CreateStructGEP(
1926 DependInfo, Base,
1927 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1928 Builder.CreateStore(
1929 ConstantInt::get(Builder.getInt8Ty(),
1930 static_cast<unsigned int>(Dep.DepKind)),
1931 Flags);
1932 }
1933 return DepArray;
1934}
1935
1936OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1937 const LocationDescription &Loc, InsertPointTy AllocaIP,
1938 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1939 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1940 Value *Priority) {
1941
1942 if (!updateToLocation(Loc))
1943 return InsertPointTy();
1944
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 // The current basic block is split into four basic blocks. After outlining,
1949 // they will be mapped as follows:
1950 // ```
1951 // def current_fn() {
1952 // current_basic_block:
1953 // br label %task.exit
1954 // task.exit:
1955 // ; instructions after task
1956 // }
1957 // def outlined_fn() {
1958 // task.alloca:
1959 // br label %task.body
1960 // task.body:
1961 // ret void
1962 // }
1963 // ```
1964 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1965 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1966 BasicBlock *TaskAllocaBB =
1967 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1968
1969 InsertPointTy TaskAllocaIP =
1970 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1971 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1972 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1973 return Err;
1974
1975 OutlineInfo OI;
1976 OI.EntryBB = TaskAllocaBB;
1977 OI.OuterAllocaBB = AllocaIP.getBlock();
1978 OI.ExitBB = TaskExitBB;
1979
1980 // Add the thread ID argument.
1981 SmallVector<Instruction *, 4> ToBeDeleted;
1982 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1983 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1984
1985 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1986 Mergeable, Priority, EventHandle, TaskAllocaBB,
1987 ToBeDeleted](Function &OutlinedFn) mutable {
1988 // Replace the Stale CI by appropriate RTL function call.
1989 assert(OutlinedFn.hasOneUse() &&
1990 "there must be a single user for the outlined function");
1991 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1992
1993 // HasShareds is true if any variables are captured in the outlined region,
1994 // false otherwise.
1995 bool HasShareds = StaleCI->arg_size() > 1;
1996 Builder.SetInsertPoint(StaleCI);
1997
1998 // Gather the arguments for emitting the runtime call for
1999 // @__kmpc_omp_task_alloc
2000 Function *TaskAllocFn =
2001 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2002
2003 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2004 // call.
2005 Value *ThreadID = getOrCreateThreadID(Ident);
2006
2007 // Argument - `flags`
2008 // Task is tied iff (Flags & 1) == 1.
2009 // Task is untied iff (Flags & 1) == 0.
2010 // Task is final iff (Flags & 2) == 2.
2011 // Task is not final iff (Flags & 2) == 0.
2012 // Task is mergeable iff (Flags & 4) == 4.
2013 // Task is not mergeable iff (Flags & 4) == 0.
2014 // Task is priority iff (Flags & 32) == 32.
2015 // Task is not priority iff (Flags & 32) == 0.
2016 // TODO: Handle the other flags.
2017 Value *Flags = Builder.getInt32(Tied);
2018 if (Final) {
2019 Value *FinalFlag =
2020 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2021 Flags = Builder.CreateOr(FinalFlag, Flags);
2022 }
2023
2024 if (Mergeable)
2025 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2026 if (Priority)
2027 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
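// Worked example of the encoding above: a tied, mergeable task with a
// priority clause ends up with Flags == 1 | 4 | 32 == 37, while an untied,
// non-final task without the other clauses keeps Flags == 0.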
2028
2029 // Argument - `sizeof_kmp_task_t` (TaskSize)
2030 // TaskSize refers to the size in bytes of the kmp_task_t data structure
2031 // including private vars accessed in the task.
2032 // TODO: add kmp_task_t_with_privates (privates)
2033 Value *TaskSize = Builder.getInt64(
2034 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2035
2036 // Argument - `sizeof_shareds` (SharedsSize)
2037 // SharedsSize refers to the shareds array size in the kmp_task_t data
2038 // structure.
2039 Value *SharedsSize = Builder.getInt64(0);
2040 if (HasShareds) {
2041 AllocaInst *ArgStructAlloca =
2042 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2043 assert(ArgStructAlloca &&
2044 "Unable to find the alloca instruction corresponding to arguments "
2045 "for extracted function");
2046 StructType *ArgStructType =
2047 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2048 assert(ArgStructType && "Unable to find struct type corresponding to "
2049 "arguments for extracted function");
2050 SharedsSize =
2051 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2052 }
2053 // Emit the @__kmpc_omp_task_alloc runtime call
2054 // The runtime call returns a pointer to an area where the task captured
2055 // variables must be copied before the task is run (TaskData)
2056 CallInst *TaskData = Builder.CreateCall(
2057 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2058 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2059 /*task_func=*/&OutlinedFn});
2060
2061 // Emit detach clause initialization.
2062 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2063 // task_descriptor);
2064 if (EventHandle) {
2065 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2066 OMPRTL___kmpc_task_allow_completion_event);
2067 llvm::Value *EventVal =
2068 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2069 llvm::Value *EventHandleAddr =
2070 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2071 Builder.getPtrTy(0));
2072 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2073 Builder.CreateStore(EventVal, EventHandleAddr);
2074 }
2075 // Copy the arguments for outlined function
2076 if (HasShareds) {
2077 Value *Shareds = StaleCI->getArgOperand(1);
2078 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2079 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2080 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2081 SharedsSize);
2082 }
2083
2084 if (Priority) {
2085 //
2086 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2087 // we populate the priority information into the "kmp_task_t" here
2088 //
2089 // The struct "kmp_task_t" definition is available in kmp.h
2090 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2091 // data2 is used for priority
2092 //
2093 Type *Int32Ty = Builder.getInt32Ty();
2094 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2095 // kmp_task_t* => { ptr }
2096 Type *TaskPtr = StructType::get(VoidPtr);
2097 Value *TaskGEP =
2098 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2099 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2100 Type *TaskStructType = StructType::get(
2101 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2102 Value *PriorityData = Builder.CreateInBoundsGEP(
2103 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2104 // kmp_cmplrdata_t => { ptr, ptr }
2105 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2106 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2107 PriorityData, {Zero, Zero});
2108 Builder.CreateStore(Priority, CmplrData);
2109 }
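// Net effect of the GEP chain above, written as C against the kmp.h layout
// sketched in the comments (data2 being the kmp_cmplrdata_t slot used for
// priority):
//   ((kmp_task_t *)TaskData)->data2.priority = Priority;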
2110
2111 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2112
2113 // In the presence of the `if` clause, the following IR is generated:
2114 // ...
2115 // %data = call @__kmpc_omp_task_alloc(...)
2116 // br i1 %if_condition, label %then, label %else
2117 // then:
2118 // call @__kmpc_omp_task(...)
2119 // br label %exit
2120 // else:
2121 // ;; Wait for resolution of dependencies, if any, before
2122 // ;; beginning the task
2123 // call @__kmpc_omp_wait_deps(...)
2124 // call @__kmpc_omp_task_begin_if0(...)
2125 // call @outlined_fn(...)
2126 // call @__kmpc_omp_task_complete_if0(...)
2127 // br label %exit
2128 // exit:
2129 // ...
2130 if (IfCondition) {
2131 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2132 // terminator.
2133 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2134 Instruction *IfTerminator =
2135 Builder.GetInsertPoint()->getParent()->getTerminator();
2136 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2137 Builder.SetInsertPoint(IfTerminator);
2138 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2139 &ElseTI);
2140 Builder.SetInsertPoint(ElseTI);
2141
2142 if (Dependencies.size()) {
2143 Function *TaskWaitFn =
2144 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2145 Builder.CreateCall(
2146 TaskWaitFn,
2147 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2148 ConstantInt::get(Builder.getInt32Ty(), 0),
2149 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2150 }
2151 Function *TaskBeginFn =
2152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2153 Function *TaskCompleteFn =
2154 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2155 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2156 CallInst *CI = nullptr;
2157 if (HasShareds)
2158 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2159 else
2160 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2161 CI->setDebugLoc(StaleCI->getDebugLoc());
2162 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2163 Builder.SetInsertPoint(ThenTI);
2164 }
2165
2166 if (Dependencies.size()) {
2167 Function *TaskFn =
2168 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2169 Builder.CreateCall(
2170 TaskFn,
2171 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2172 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2173 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2174
2175 } else {
2176 // Emit the @__kmpc_omp_task runtime call to spawn the task
2177 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2178 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2179 }
2180
2181 StaleCI->eraseFromParent();
2182
2183 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2184 if (HasShareds) {
2185 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2186 OutlinedFn.getArg(1)->replaceUsesWithIf(
2187 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2188 }
2189
2190 for (Instruction *I : llvm::reverse(ToBeDeleted))
2191 I->eraseFromParent();
2192 };
2193
2194 addOutlineInfo(std::move(OI));
2195 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2196
2197 return Builder.saveIP();
2198}
2199
2200OpenMPIRBuilder::InsertPointOrErrorTy
2201OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2202 InsertPointTy AllocaIP,
2203 BodyGenCallbackTy BodyGenCB) {
2204 if (!updateToLocation(Loc))
2205 return InsertPointTy();
2206
2207 uint32_t SrcLocStrSize;
2208 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2209 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2210 Value *ThreadID = getOrCreateThreadID(Ident);
2211
2212 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2213 Function *TaskgroupFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2215 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2216
2217 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2218 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2219 return Err;
2220
2221 Builder.SetInsertPoint(TaskgroupExitBB);
2222 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2223 Function *EndTaskgroupFn =
2224 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2225 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2226
2227 return Builder.saveIP();
2228}
2229
2230OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2231 const LocationDescription &Loc, InsertPointTy AllocaIP,
2232 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2233 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2234 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2235
2236 if (!updateToLocation(Loc))
2237 return Loc.IP;
2238
2239 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2240 // that block has not yet been created when this callback runs.
2241 SmallVector<BranchInst *> CancellationBranches;
2242 auto FiniCBWrapper = [&](InsertPointTy IP) {
2243 if (IP.getBlock()->end() != IP.getPoint())
2244 return FiniCB(IP);
2245 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2246 // will fail because that function requires the Finalization Basic Block to
2247 // have a terminator, which is already removed by EmitOMPRegionBody.
2248 // IP is currently at the cancellation block.
2249 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2250 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2251 CancellationBranches.push_back(DummyBranch);
2252 return FiniCB(IP);
2253 };
2254
2255 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2256
2257 // Each section is emitted as a switch case
2258 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2259 // -> OMP.createSection() which generates the IR for each section
2260 // Iterate through all sections and emit a switch construct:
2261 // switch (IV) {
2262 // case 0:
2263 // <SectionStmt[0]>;
2264 // break;
2265 // ...
2266 // case <NumSection> - 1:
2267 // <SectionStmt[<NumSection> - 1]>;
2268 // break;
2269 // }
2270 // ...
2271 // section_loop.after:
2272 // <FiniCB>;
2273 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2274 Builder.restoreIP(CodeGenIP);
2275 BasicBlock *Continue =
2276 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2277 Function *CurFn = Continue->getParent();
2278 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2279
2280 unsigned CaseNumber = 0;
2281 for (auto SectionCB : SectionCBs) {
2282 BasicBlock *CaseBB = BasicBlock::Create(
2283 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2284 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2285 Builder.SetInsertPoint(CaseBB);
2286 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2287 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2288 CaseEndBr->getIterator()}))
2289 return Err;
2290 CaseNumber++;
2291 }
2292 // remove the existing terminator from body BB since there can be no
2293 // terminators after switch/case
2294 return Error::success();
2295 };
2296 // Loop body ends here
2297 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2298 Type *I32Ty = Type::getInt32Ty(M.getContext());
2299 Value *LB = ConstantInt::get(I32Ty, 0);
2300 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2301 Value *ST = ConstantInt::get(I32Ty, 1);
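// As a concrete example, three section callbacks yield the canonical loop
// `for (i32 iv = 0; iv < 3; iv += 1)`, whose body switches on iv; the
// workshared loop below then distributes each section as one iteration.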
2302 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2303 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2304 if (!LoopInfo)
2305 return LoopInfo.takeError();
2306
2307 InsertPointOrErrorTy WsloopIP =
2308 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2309 WorksharingLoopType::ForStaticLoop, !IsNowait);
2310 if (!WsloopIP)
2311 return WsloopIP.takeError();
2312 InsertPointTy AfterIP = *WsloopIP;
2313
2314 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2315 assert(LoopFini && "Bad structure of static workshare loop finalization");
2316
2317 // Apply the finalization callback in LoopAfterBB
2318 auto FiniInfo = FinalizationStack.pop_back_val();
2319 assert(FiniInfo.DK == OMPD_sections &&
2320 "Unexpected finalization stack state!");
2321 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2322 Builder.restoreIP(AfterIP);
2323 BasicBlock *FiniBB =
2324 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2325 if (Error Err = CB(Builder.saveIP()))
2326 return Err;
2327 AfterIP = {FiniBB, FiniBB->begin()};
2328 }
2329
2330 // Now we can fix the dummy branch to point to the right place
2331 for (BranchInst *DummyBranch : CancellationBranches) {
2332 assert(DummyBranch->getNumSuccessors() == 1);
2333 DummyBranch->setSuccessor(0, LoopFini);
2334 }
2335
2336 return AfterIP;
2337}
2338
2339OpenMPIRBuilder::InsertPointOrErrorTy
2340OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2341 BodyGenCallbackTy BodyGenCB,
2342 FinalizeCallbackTy FiniCB) {
2343 if (!updateToLocation(Loc))
2344 return Loc.IP;
2345
2346 auto FiniCBWrapper = [&](InsertPointTy IP) {
2347 if (IP.getBlock()->end() != IP.getPoint())
2348 return FiniCB(IP);
2349 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2350 // will fail because that function requires the Finalization Basic Block to
2351 // have a terminator, which is already removed by EmitOMPRegionBody.
2352 // IP is currently at the cancellation block.
2353 // We need to backtrack to the condition block to fetch
2354 // the exit block and create a branch from the cancellation
2355 // block to the exit block.
2356 IRBuilder<>::InsertPointGuard IPG(Builder);
2357 Builder.restoreIP(IP);
2358 auto *CaseBB = Loc.IP.getBlock();
2359 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2360 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2361 Instruction *I = Builder.CreateBr(ExitBB);
2362 IP = InsertPointTy(I->getParent(), I->getIterator());
2363 return FiniCB(IP);
2364 };
2365
2366 Directive OMPD = Directive::OMPD_sections;
2367 // Since we are using Finalization Callback here, HasFinalize
2368 // and IsCancellable have to be true
2369 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2370 /*Conditional*/ false, /*hasFinalize*/ true,
2371 /*IsCancellable*/ true);
2372}
2373
2374static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2375 BasicBlock::iterator IT(I);
2376 IT++;
2377 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2378}
2379
2380Value *OpenMPIRBuilder::getGPUThreadID() {
2381 return Builder.CreateCall(
2382 getOrCreateRuntimeFunction(M,
2383 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2384 {});
2385}
2386
2387Value *OpenMPIRBuilder::getGPUWarpSize() {
2388 return Builder.CreateCall(
2389 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2390}
2391
2392Value *OpenMPIRBuilder::getNVPTXWarpID() {
2393 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2394 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2395}
2396
2397Value *OpenMPIRBuilder::getNVPTXLaneID() {
2398 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2399 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2400 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2401 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2402 "nvptx_lane_id");
2403}
2404
2405Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2406 Type *ToType) {
2407 Type *FromType = From->getType();
2408 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2409 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2410 assert(FromSize > 0 && "From size must be greater than zero");
2411 assert(ToSize > 0 && "To size must be greater than zero");
2412 if (FromType == ToType)
2413 return From;
2414 if (FromSize == ToSize)
2415 return Builder.CreateBitCast(From, ToType);
2416 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2417 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2418 InsertPointTy SaveIP = Builder.saveIP();
2419 Builder.restoreIP(AllocaIP);
2420 Value *CastItem = Builder.CreateAlloca(ToType);
2421 Builder.restoreIP(SaveIP);
2422
2423 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2424 CastItem, Builder.getPtrTy(0));
2425 Builder.CreateStore(From, ValCastItem);
2426 return Builder.CreateLoad(ToType, CastItem);
2427}
2428
2429Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2430 Value *Element,
2431 Type *ElementType,
2432 Value *Offset) {
2433 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2434 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2435
2436 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2437 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2438 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2439 Value *WarpSize =
2440 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2441 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2442 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2443 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2444 Value *WarpSizeCast =
2445 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2446 Value *ShuffleCall =
2447 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2448 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2449}
2450
2451void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2452 Value *DstAddr, Type *ElemType,
2453 Value *Offset, Type *ReductionArrayTy) {
2454 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2455 // Create the loop over the big-sized data.
2456 // ptr = (void*)Elem;
2457 // ptrEnd = (void*) Elem + 1;
2458 // Step = 8;
2459 // while (ptr + Step < ptrEnd)
2460 // shuffle((int64_t)*ptr);
2461 // Step = 4;
2462 // while (ptr + Step < ptrEnd)
2463 // shuffle((int32_t)*ptr);
2464 // ...
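// Worked examples: a 16-byte element takes the pre_cond/then loop emitted
// below with two 8-byte transfers, while a 6-byte element takes the
// single-transfer path twice: one 4-byte transfer, after which `Size %= 4`
// leaves a final 2-byte transfer.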
2465 Type *IndexTy = Builder.getIndexTy(
2466 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2467 Value *ElemPtr = DstAddr;
2468 Value *Ptr = SrcAddr;
2469 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2470 if (Size < IntSize)
2471 continue;
2472 Type *IntType = Builder.getIntNTy(IntSize * 8);
2473 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2474 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2475 Value *SrcAddrGEP =
2476 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2477 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2478 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2479
2480 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2481 if ((Size / IntSize) > 1) {
2482 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2483 SrcAddrGEP, Builder.getPtrTy());
2484 BasicBlock *PreCondBB =
2485 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2486 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2487 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2488 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2489 emitBlock(PreCondBB, CurFunc);
2490 PHINode *PhiSrc =
2491 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2492 PhiSrc->addIncoming(Ptr, CurrentBB);
2493 PHINode *PhiDest =
2494 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2495 PhiDest->addIncoming(ElemPtr, CurrentBB);
2496 Ptr = PhiSrc;
2497 ElemPtr = PhiDest;
2498 Value *PtrDiff = Builder.CreatePtrDiff(
2499 Builder.getInt8Ty(), PtrEnd,
2500 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2501 Builder.CreateCondBr(
2502 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2503 ExitBB);
2504 emitBlock(ThenBB, CurFunc);
2505 Value *Res = createRuntimeShuffleFunction(
2506 AllocaIP,
2507 Builder.CreateAlignedLoad(
2508 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2509 IntType, Offset);
2510 Builder.CreateAlignedStore(Res, ElemPtr,
2511 M.getDataLayout().getPrefTypeAlign(ElemType));
2512 Value *LocalPtr =
2513 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2514 Value *LocalElemPtr =
2515 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2516 PhiSrc->addIncoming(LocalPtr, ThenBB);
2517 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2518 emitBranch(PreCondBB);
2519 emitBlock(ExitBB, CurFunc);
2520 } else {
2521 Value *Res = createRuntimeShuffleFunction(
2522 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2523 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2524 Res->getType()->getScalarSizeInBits())
2525 Res = Builder.CreateTrunc(Res, ElemType);
2526 Builder.CreateStore(Res, ElemPtr);
2527 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2528 ElemPtr =
2529 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2530 }
2531 Size = Size % IntSize;
2532 }
2533}
2534
2535void OpenMPIRBuilder::emitReductionListCopy(
2536 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2537 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2538 CopyOptionsTy CopyOptions) {
2539 Type *IndexTy = Builder.getIndexTy(
2540 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2541 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2542
2543 // Iterates, element-by-element, through the source Reduce list and
2544 // makes a copy.
2545 for (auto En : enumerate(ReductionInfos)) {
2546 const ReductionInfo &RI = En.value();
2547 Value *SrcElementAddr = nullptr;
2548 Value *DestElementAddr = nullptr;
2549 Value *DestElementPtrAddr = nullptr;
2550 // Should we shuffle in an element from a remote lane?
2551 bool ShuffleInElement = false;
2552 // Set to true to update the pointer in the dest Reduce list to a
2553 // newly created element.
2554 bool UpdateDestListPtr = false;
2555
2556 // Step 1.1: Get the address for the src element in the Reduce list.
2557 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2558 ReductionArrayTy, SrcBase,
2559 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2560 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2561
2562 // Step 1.2: Create a temporary to store the element in the destination
2563 // Reduce list.
2564 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2565 ReductionArrayTy, DestBase,
2566 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2567 switch (Action) {
2568 case CopyAction::RemoteLaneToThread: {
2569 InsertPointTy CurIP = Builder.saveIP();
2570 Builder.restoreIP(AllocaIP);
2571 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2572 ".omp.reduction.element");
2573 DestAlloca->setAlignment(
2574 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2575 DestElementAddr = DestAlloca;
2576 DestElementAddr =
2577 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2578 DestElementAddr->getName() + ".ascast");
2579 Builder.restoreIP(CurIP);
2580 ShuffleInElement = true;
2581 UpdateDestListPtr = true;
2582 break;
2583 }
2584 case CopyAction::ThreadCopy: {
2585 DestElementAddr =
2586 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2587 break;
2588 }
2589 }
2590
2591 // Now that all active lanes have read the element in the
2592 // Reduce list, shuffle over the value from the remote lane.
2593 if (ShuffleInElement) {
2594 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2595 RemoteLaneOffset, ReductionArrayTy);
2596 } else {
2597 switch (RI.EvaluationKind) {
2598 case EvalKind::Scalar: {
2599 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2600 // Store the source element value to the dest element address.
2601 Builder.CreateStore(Elem, DestElementAddr);
2602 break;
2603 }
2604 case EvalKind::Complex: {
2605 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2606 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2607 Value *SrcReal = Builder.CreateLoad(
2608 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2609 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2610 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2611 Value *SrcImg = Builder.CreateLoad(
2612 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2613
2614 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2615 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2616 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2617 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2618 Builder.CreateStore(SrcReal, DestRealPtr);
2619 Builder.CreateStore(SrcImg, DestImgPtr);
2620 break;
2621 }
2622 case EvalKind::Aggregate: {
2623 Value *SizeVal = Builder.getInt64(
2624 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2625 Builder.CreateMemCpy(
2626 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2627 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2628 SizeVal, false);
2629 break;
2630 }
2631 };
2632 }
2633
2634 // Step 3.1: Modify reference in dest Reduce list as needed.
2635 // Modifying the reference in Reduce list to point to the newly
2636 // created element. The element is live in the current function
2637 // scope and that of functions it invokes (i.e., reduce_function).
2638 // RemoteReduceData[i] = (void*)&RemoteElem
2639 if (UpdateDestListPtr) {
2640 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2641 DestElementAddr, Builder.getPtrTy(),
2642 DestElementAddr->getName() + ".ascast");
2643 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2644 }
2645 }
2646}
2647
2648Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2649 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2650 AttributeList FuncAttrs) {
2651 InsertPointTy SavedIP = Builder.saveIP();
2652 LLVMContext &Ctx = M.getContext();
2653 FunctionType *FuncTy = FunctionType::get(
2654 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2655 /* IsVarArg */ false);
2656 Function *WcFunc =
2658 "_omp_reduction_inter_warp_copy_func", &M);
2659 WcFunc->setAttributes(FuncAttrs);
2660 WcFunc->addParamAttr(0, Attribute::NoUndef);
2661 WcFunc->addParamAttr(1, Attribute::NoUndef);
2662 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2663 Builder.SetInsertPoint(EntryBB);
2664
2665 // ReduceList: thread local Reduce list.
2666 // At the stage of the computation when this function is called, partially
2667 // aggregated values reside in the first lane of every active warp.
2668 Argument *ReduceListArg = WcFunc->getArg(0);
2669 // NumWarps: number of warps active in the parallel region. This could
2670 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2671 Argument *NumWarpsArg = WcFunc->getArg(1);
2672
2673 // This array is used as a medium to transfer, one reduce element at a time,
2674 // the data from the first lane of every warp to lanes in the first warp
2675 // in order to perform the final step of a reduction in a parallel region
2676 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2677 // for reduced latency, as well as to have a distinct copy for concurrently
2678 // executing target regions. The array is declared with common linkage so
2679 // as to be shared across compilation units.
2680 StringRef TransferMediumName =
2681 "__openmp_nvptx_data_transfer_temporary_storage";
2682 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2683 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2684 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2685 if (!TransferMedium) {
2686 TransferMedium = new GlobalVariable(
2687 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2688 UndefValue::get(ArrayTy), TransferMediumName,
2689 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2690 /*AddressSpace=*/3);
2691 }
2692
2693 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694 Value *GPUThreadID = getGPUThreadID();
2695 // nvptx_lane_id = nvptx_id % warpsize
2696 Value *LaneID = getNVPTXLaneID();
2697 // nvptx_warp_id = nvptx_id / warpsize
2698 Value *WarpID = getNVPTXWarpID();
2699
2700 InsertPointTy AllocaIP =
2701 InsertPointTy(Builder.GetInsertBlock(),
2702 Builder.GetInsertBlock()->getFirstInsertionPt());
2703 Type *Arg0Type = ReduceListArg->getType();
2704 Type *Arg1Type = NumWarpsArg->getType();
2705 Builder.restoreIP(AllocaIP);
2706 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2707 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2708 AllocaInst *NumWarpsAlloca =
2709 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2710 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2711 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2712 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2713 NumWarpsAlloca, Builder.getPtrTy(0),
2714 NumWarpsAlloca->getName() + ".ascast");
2715 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2716 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2717 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2718 InsertPointTy CodeGenIP =
2719 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2720 Builder.restoreIP(CodeGenIP);
2721
2722 Value *ReduceList =
2723 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2724
2725 for (auto En : enumerate(ReductionInfos)) {
2726 //
2727 // Warp master copies reduce element to transfer medium in __shared__
2728 // memory.
2729 //
2730 const ReductionInfo &RI = En.value();
2731 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2732 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2733 Type *CType = Builder.getIntNTy(TySize * 8);
2734
2735 unsigned NumIters = RealTySize / TySize;
2736 if (NumIters == 0)
2737 continue;
2738 Value *Cnt = nullptr;
2739 Value *CntAddr = nullptr;
2740 BasicBlock *PrecondBB = nullptr;
2741 BasicBlock *ExitBB = nullptr;
2742 if (NumIters > 1) {
2743 CodeGenIP = Builder.saveIP();
2744 Builder.restoreIP(AllocaIP);
2745 CntAddr =
2746 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2747
2748 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2749 CntAddr->getName() + ".ascast");
2750 Builder.restoreIP(CodeGenIP);
2751 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2752 CntAddr,
2753 /*Volatile=*/false);
2754 PrecondBB = BasicBlock::Create(Ctx, "precond");
2755 ExitBB = BasicBlock::Create(Ctx, "exit");
2756 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2757 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2758 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2759 /*Volatile=*/false);
2760 Value *Cmp = Builder.CreateICmpULT(
2761 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2762 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2763 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2764 }
2765
2766 // kmpc_barrier.
2767 InsertPointOrErrorTy BarrierIP1 =
2768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2769 omp::Directive::OMPD_unknown,
2770 /* ForceSimpleCall */ false,
2771 /* CheckCancelFlag */ true);
2772 if (!BarrierIP1)
2773 return BarrierIP1.takeError();
2774 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2775 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2776 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2777
2778 // if (lane_id == 0)
2779 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2780 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2781 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2782
2783 // Reduce element = LocalReduceList[i]
2784 auto *RedListArrayTy =
2785 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2786 Type *IndexTy = Builder.getIndexTy(
2787 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2788 Value *ElemPtrPtr =
2789 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2790 {ConstantInt::get(IndexTy, 0),
2791 ConstantInt::get(IndexTy, En.index())});
2792 // elemptr = ((CopyType*)(elemptrptr)) + I
2793 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2794 if (NumIters > 1)
2795 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2796
2797 // Get pointer to location in transfer medium.
2798 // MediumPtr = &medium[warp_id]
2799 Value *MediumPtr = Builder.CreateInBoundsGEP(
2800 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2801 // elem = *elemptr
2802 //*MediumPtr = elem
2803 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2804 // Store the source element value to the dest element address.
2805 Builder.CreateStore(Elem, MediumPtr,
2806 /*IsVolatile*/ true);
2807 Builder.CreateBr(MergeBB);
2808
2809 // else
2810 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2811 Builder.CreateBr(MergeBB);
2812
2813 // endif
2814 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2815 InsertPointOrErrorTy BarrierIP2 =
2816 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2817 omp::Directive::OMPD_unknown,
2818 /* ForceSimpleCall */ false,
2819 /* CheckCancelFlag */ true);
2820 if (!BarrierIP2)
2821 return BarrierIP2.takeError();
2822
2823 // Warp 0 copies reduce element from transfer medium
2824 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2825 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2826 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2827
2828 Value *NumWarpsVal =
2829 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2830 // Up to 32 threads in warp 0 are active.
2831 Value *IsActiveThread =
2832 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2833 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2834
2835 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2836
2837 // SrcMediumPtr = &medium[tid]
2838 // SrcMediumVal = *SrcMediumPtr
2839 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2840 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2841 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2842 Value *TargetElemPtrPtr =
2843 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2844 {ConstantInt::get(IndexTy, 0),
2845 ConstantInt::get(IndexTy, En.index())});
2846 Value *TargetElemPtrVal =
2847 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2848 Value *TargetElemPtr = TargetElemPtrVal;
2849 if (NumIters > 1)
2850 TargetElemPtr =
2851 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2852
2853 // *TargetElemPtr = SrcMediumVal;
2854 Value *SrcMediumValue =
2855 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2856 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2857 Builder.CreateBr(W0MergeBB);
2858
2859 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2860 Builder.CreateBr(W0MergeBB);
2861
2862 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2863
2864 if (NumIters > 1) {
2865 Cnt = Builder.CreateNSWAdd(
2866 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2867 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2868
2869 auto *CurFn = Builder.GetInsertBlock()->getParent();
2870 emitBranch(PrecondBB);
2871 emitBlock(ExitBB, CurFn);
2872 }
2873 RealTySize %= TySize;
2874 }
2875 }
2876
2877 Builder.CreateRetVoid();
2878 Builder.restoreIP(SavedIP);
2879
2880 return WcFunc;
2881}
2882
2883Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2885 AttributeList FuncAttrs) {
2886 LLVMContext &Ctx = M.getContext();
2887 FunctionType *FuncTy =
2888 FunctionType::get(Builder.getVoidTy(),
2889 {Builder.getPtrTy(), Builder.getInt16Ty(),
2890 Builder.getInt16Ty(), Builder.getInt16Ty()},
2891 /* IsVarArg */ false);
2892 Function *SarFunc =
2894 "_omp_reduction_shuffle_and_reduce_func", &M);
2895 SarFunc->setAttributes(FuncAttrs);
2896 SarFunc->addParamAttr(0, Attribute::NoUndef);
2897 SarFunc->addParamAttr(1, Attribute::NoUndef);
2898 SarFunc->addParamAttr(2, Attribute::NoUndef);
2899 SarFunc->addParamAttr(3, Attribute::NoUndef);
2900 SarFunc->addParamAttr(1, Attribute::SExt);
2901 SarFunc->addParamAttr(2, Attribute::SExt);
2902 SarFunc->addParamAttr(3, Attribute::SExt);
2903 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2904 Builder.SetInsertPoint(EntryBB);
2905
2906 // Thread local Reduce list used to host the values of data to be reduced.
2907 Argument *ReduceListArg = SarFunc->getArg(0);
2908 // Current lane id; could be logical.
2909 Argument *LaneIDArg = SarFunc->getArg(1);
2910 // Offset of the remote source lane relative to the current lane.
2911 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2912 // Algorithm version. This is expected to be known at compile time.
2913 Argument *AlgoVerArg = SarFunc->getArg(3);
2914
2915 Type *ReduceListArgType = ReduceListArg->getType();
2916 Type *LaneIDArgType = LaneIDArg->getType();
2917 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2918 Value *ReduceListAlloca = Builder.CreateAlloca(
2919 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2920 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2921 LaneIDArg->getName() + ".addr");
2922 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2923 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2924 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2925 AlgoVerArg->getName() + ".addr");
2926 ArrayType *RedListArrayTy =
2927 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2928
2929 // Create a local thread-private variable to host the Reduce list
2930 // from a remote lane.
2931 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2932 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2933
2934 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2935 ReduceListAlloca, ReduceListArgType,
2936 ReduceListAlloca->getName() + ".ascast");
2937 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2938 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2939 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2940 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2941 RemoteLaneOffsetAlloca->getName() + ".ascast");
2942 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2943 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2944 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2945 RemoteReductionListAlloca, Builder.getPtrTy(),
2946 RemoteReductionListAlloca->getName() + ".ascast");
2947
2948 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2949 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2950 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2951 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2952
2953 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2954 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2955 Value *RemoteLaneOffset =
2956 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2957 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2958
2959 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2960
2961 // This loop iterates through the list of reduce elements and copies,
2962 // element by element, from a remote lane in the warp to RemoteReduceList,
2963 // hosted on the thread's stack.
2964 emitReductionListCopy(
2965 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2966 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2967
2968 // The action to be performed on the Remote Reduce list depends on the
2969 // algorithm version.
2970 //
2971 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2972 // LaneId % 2 == 0 && Offset > 0):
2973 // do the reduction value aggregation
2974 //
2975 // The thread local variable Reduce list is mutated in place to host the
2976 // reduced data, which is the aggregated value produced from local and
2977 // remote lanes.
2978 //
2979 // Note that AlgoVer is expected to be a constant integer known at compile
2980 // time.
2981 // When AlgoVer==0, the first conjunction evaluates to true, making
2982 // the entire predicate true at compile time.
2983 // When AlgoVer==1, only the second part of the second conjunction must
2984 // be evaluated at runtime; the other conjunctions evaluate to false
2985 // at compile time.
2986 // When AlgoVer==2, only the second part of the third conjunction must
2987 // be evaluated at runtime; the other conjunctions evaluate to false
2988 // at compile time.
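// Equivalently, the guard materialized below is (sketch):
//
//   bool do_reduce = (AlgoVer == 0) ||
//                    (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
//                    (AlgoVer == 2 && (LaneId & 1) == 0 &&
//                     RemoteLaneOffset > 0);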
2989 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2990 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2991 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2992 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2993 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2994 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2995 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2996 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2997 Value *RemoteOffsetComp =
2998 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2999 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3000 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3001 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3002
3003 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3004 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3005 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3006
3007 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3008 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3009 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3010 ReduceList, Builder.getPtrTy());
3011 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3012 RemoteListAddrCast, Builder.getPtrTy());
3013 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3014 ->addFnAttr(Attribute::NoUnwind);
3015 Builder.CreateBr(MergeBB);
3016
3017 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3018 Builder.CreateBr(MergeBB);
3019
3020 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3021
3022 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3023 // Reduce list.
3024 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3025 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3026 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3027
3028 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3029 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3030 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3031 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3032
3033 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3034 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3035 ReductionInfos, RemoteListAddrCast, ReduceList);
3036 Builder.CreateBr(CpyMergeBB);
3037
3038 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3039 Builder.CreateBr(CpyMergeBB);
3040
3041 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3042
3043 Builder.CreateRetVoid();
3044
3045 return SarFunc;
3046}
3047
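// Rough shape of the list-to-global copy helper emitted below (a sketch, not
// the literal IR), where the buffer is a row-indexed struct of reduction
// slots:
//
//   void _omp_reduction_list_to_global_copy_func(void *buffer, int idx,
//                                                void *reduce_list) {
//     for each reduction i:
//       buffer[idx].elem_i = *reduce_list[i];  // scalar, complex, aggregate
//   }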
3048Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3049 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3050 AttributeList FuncAttrs) {
3051 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3052 LLVMContext &Ctx = M.getContext();
3053 auto *FuncTy = FunctionType::get(
3054 Builder.getVoidTy(),
3055 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3056 /* IsVarArg */ false);
3057 Function *LtGCFunc =
3058 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3059 "_omp_reduction_list_to_global_copy_func", &M);
3060 LtGCFunc->setAttributes(FuncAttrs);
3061 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3062 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3063 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3064
3065 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3066 Builder.SetInsertPoint(EntryBlock);
3067
3068 // Buffer: global reduction buffer.
3069 Argument *BufferArg = LtGCFunc->getArg(0);
3070 // Idx: index of the buffer.
3071 Argument *IdxArg = LtGCFunc->getArg(1);
3072 // ReduceList: thread local Reduce list.
3073 Argument *ReduceListArg = LtGCFunc->getArg(2);
3074
3075 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3076 BufferArg->getName() + ".addr");
3077 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3078 IdxArg->getName() + ".addr");
3079 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3080 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3081 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3082 BufferArgAlloca, Builder.getPtrTy(),
3083 BufferArgAlloca->getName() + ".ascast");
3084 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3085 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3086 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3087 ReduceListArgAlloca, Builder.getPtrTy(),
3088 ReduceListArgAlloca->getName() + ".ascast");
3089
3090 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3091 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3092 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3093
3094 Value *LocalReduceList =
3095 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3096 Value *BufferArgVal =
3097 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3098 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3099 Type *IndexTy = Builder.getIndexTy(
3100 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3101 for (auto En : enumerate(ReductionInfos)) {
3102 const ReductionInfo &RI = En.value();
3103 auto *RedListArrayTy =
3104 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3105 // Reduce element = LocalReduceList[i]
3106 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3107 RedListArrayTy, LocalReduceList,
3108 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3109 // elemptr = ((CopyType*)(elemptrptr)) + I
3110 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3111
3112 // Global = Buffer.VD[Idx];
3113 Value *BufferVD =
3114 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3115 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3116 ReductionsBufferTy, BufferVD, 0, En.index());
3117
3118 switch (RI.EvaluationKind) {
3119 case EvalKind::Scalar: {
3120 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3121 Builder.CreateStore(TargetElement, GlobVal);
3122 break;
3123 }
3124 case EvalKind::Complex: {
3125 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3126 RI.ElementType, ElemPtr, 0, 0, ".realp");
3127 Value *SrcReal = Builder.CreateLoad(
3128 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3129 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3130 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3131 Value *SrcImg = Builder.CreateLoad(
3132 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3133
3134 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3135 RI.ElementType, GlobVal, 0, 0, ".realp");
3136 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3137 RI.ElementType, GlobVal, 0, 1, ".imagp");
3138 Builder.CreateStore(SrcReal, DestRealPtr);
3139 Builder.CreateStore(SrcImg, DestImgPtr);
3140 break;
3141 }
3142 case EvalKind::Aggregate: {
3143 Value *SizeVal =
3144 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3145 Builder.CreateMemCpy(
3146 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3147 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3148 break;
3149 }
3150 }
3151 }
3152
3153 Builder.CreateRetVoid();
3154 Builder.restoreIP(OldIP);
3155 return LtGCFunc;
3156}
3157
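// Rough shape of the list-to-global reduce helper emitted below (sketch): it
// builds a temporary reduce list whose slots point into the buffer row and
// reduces the thread's list into it:
//
//   void _omp_reduction_list_to_global_reduce_func(void *buffer, int idx,
//                                                  void *reduce_list) {
//     void *global_list[n];
//     for each reduction i:
//       global_list[i] = &buffer[idx].elem_i;
//     reduce_function(global_list, reduce_list);
//   }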
3158Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3159 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3160 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3161 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3162 LLVMContext &Ctx = M.getContext();
3163 auto *FuncTy = FunctionType::get(
3164 Builder.getVoidTy(),
3165 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3166 /* IsVarArg */ false);
3167 Function *LtGRFunc =
3168 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3169 "_omp_reduction_list_to_global_reduce_func", &M);
3170 LtGRFunc->setAttributes(FuncAttrs);
3171 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3172 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3173 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3174
3175 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3176 Builder.SetInsertPoint(EntryBlock);
3177
3178 // Buffer: global reduction buffer.
3179 Argument *BufferArg = LtGRFunc->getArg(0);
3180 // Idx: index of the buffer.
3181 Argument *IdxArg = LtGRFunc->getArg(1);
3182 // ReduceList: thread local Reduce list.
3183 Argument *ReduceListArg = LtGRFunc->getArg(2);
3184
3185 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3186 BufferArg->getName() + ".addr");
3187 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3188 IdxArg->getName() + ".addr");
3189 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3190 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3191 auto *RedListArrayTy =
3192 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3193
3194 // 1. Build a list of reduction variables.
3195 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3196 Value *LocalReduceList =
3197 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3198
3199 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3200 BufferArgAlloca, Builder.getPtrTy(),
3201 BufferArgAlloca->getName() + ".ascast");
3202 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3203 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3204 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 ReduceListArgAlloca, Builder.getPtrTy(),
3206 ReduceListArgAlloca->getName() + ".ascast");
3207 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3208 LocalReduceList, Builder.getPtrTy(),
3209 LocalReduceList->getName() + ".ascast");
3210
3211 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3212 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3213 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3214
3215 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3216 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3217 Type *IndexTy = Builder.getIndexTy(
3218 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3219 for (auto En : enumerate(ReductionInfos)) {
3220 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3221 RedListArrayTy, LocalReduceListAddrCast,
3222 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3223 Value *BufferVD =
3224 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3225 // Global = Buffer.VD[Idx];
3226 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3227 ReductionsBufferTy, BufferVD, 0, En.index());
3228 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3229 }
3230
3231 // Call reduce_function(GlobalReduceList, ReduceList)
3232 Value *ReduceList =
3233 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3234 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3235 ->addFnAttr(Attribute::NoUnwind);
3236 Builder.CreateRetVoid();
3237 Builder.restoreIP(OldIP);
3238 return LtGRFunc;
3239}
3240
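// The global-to-list copy helper emitted below is the mirror image of the
// list-to-global copy above (sketch):
//
//   void _omp_reduction_global_to_list_copy_func(void *buffer, int idx,
//                                                void *reduce_list) {
//     for each reduction i:
//       *reduce_list[i] = buffer[idx].elem_i;
//   }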
3241Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3242 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3243 AttributeList FuncAttrs) {
3244 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3245 LLVMContext &Ctx = M.getContext();
3246 auto *FuncTy = FunctionType::get(
3247 Builder.getVoidTy(),
3248 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3249 /* IsVarArg */ false);
3250 Function *LtGCFunc =
3251 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3252 "_omp_reduction_global_to_list_copy_func", &M);
3253 LtGCFunc->setAttributes(FuncAttrs);
3254 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3255 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3256 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3257
3258 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3259 Builder.SetInsertPoint(EntryBlock);
3260
3261 // Buffer: global reduction buffer.
3262 Argument *BufferArg = LtGCFunc->getArg(0);
3263 // Idx: index of the buffer.
3264 Argument *IdxArg = LtGCFunc->getArg(1);
3265 // ReduceList: thread local Reduce list.
3266 Argument *ReduceListArg = LtGCFunc->getArg(2);
3267
3268 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3269 BufferArg->getName() + ".addr");
3270 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3271 IdxArg->getName() + ".addr");
3272 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3273 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3274 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3275 BufferArgAlloca, Builder.getPtrTy(),
3276 BufferArgAlloca->getName() + ".ascast");
3277 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3278 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3279 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3280 ReduceListArgAlloca, Builder.getPtrTy(),
3281 ReduceListArgAlloca->getName() + ".ascast");
3282 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3283 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3284 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3285
3286 Value *LocalReduceList =
3287 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3288 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3289 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3290 Type *IndexTy = Builder.getIndexTy(
3291 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3292 for (auto En : enumerate(ReductionInfos)) {
3293 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3294 auto *RedListArrayTy =
3295 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3296 // Reduce element = LocalReduceList[i]
3297 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3298 RedListArrayTy, LocalReduceList,
3299 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3300 // elemptr = ((CopyType*)(elemptrptr)) + I
3301 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3302 // Global = Buffer.VD[Idx];
3303 Value *BufferVD =
3304 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3305 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3306 ReductionsBufferTy, BufferVD, 0, En.index());
3307
3308 switch (RI.EvaluationKind) {
3309 case EvalKind::Scalar: {
3310 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3311 Builder.CreateStore(TargetElement, ElemPtr);
3312 break;
3313 }
3314 case EvalKind::Complex: {
3315 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3316 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3317 Value *SrcReal = Builder.CreateLoad(
3318 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3319 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3320 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3321 Value *SrcImg = Builder.CreateLoad(
3322 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3323
3324 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3325 RI.ElementType, ElemPtr, 0, 0, ".realp");
3326 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3327 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3328 Builder.CreateStore(SrcReal, DestRealPtr);
3329 Builder.CreateStore(SrcImg, DestImgPtr);
3330 break;
3331 }
3332 case EvalKind::Aggregate: {
3333 Value *SizeVal =
3334 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3335 Builder.CreateMemCpy(
3336 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3337 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3338 SizeVal, false);
3339 break;
3340 }
3341 }
3342 }
3343
3344 Builder.CreateRetVoid();
3345 Builder.restoreIP(OldIP);
3346 return LtGCFunc;
3347}
3348
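// Mirror image of the list-to-global reduce helper; note the swapped operand
// order of the final combiner call (sketch):
//
//   void _omp_reduction_global_to_list_reduce_func(void *buffer, int idx,
//                                                  void *reduce_list) {
//     void *global_list[n];
//     for each reduction i:
//       global_list[i] = &buffer[idx].elem_i;
//     reduce_function(reduce_list, global_list);
//   }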
3349Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3350 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3351 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3352 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3353 LLVMContext &Ctx = M.getContext();
3354 auto *FuncTy = FunctionType::get(
3355 Builder.getVoidTy(),
3356 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3357 /* IsVarArg */ false);
3358 Function *LtGRFunc =
3359 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3360 "_omp_reduction_global_to_list_reduce_func", &M);
3361 LtGRFunc->setAttributes(FuncAttrs);
3362 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3363 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3364 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3365
3366 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3367 Builder.SetInsertPoint(EntryBlock);
3368
3369 // Buffer: global reduction buffer.
3370 Argument *BufferArg = LtGRFunc->getArg(0);
3371 // Idx: index of the buffer.
3372 Argument *IdxArg = LtGRFunc->getArg(1);
3373 // ReduceList: thread local Reduce list.
3374 Argument *ReduceListArg = LtGRFunc->getArg(2);
3375
3376 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3377 BufferArg->getName() + ".addr");
3378 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3379 IdxArg->getName() + ".addr");
3380 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3381 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3382 ArrayType *RedListArrayTy =
3383 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3384
3385 // 1. Build a list of reduction variables.
3386 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3387 Value *LocalReduceList =
3388 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3389
3390 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3391 BufferArgAlloca, Builder.getPtrTy(),
3392 BufferArgAlloca->getName() + ".ascast");
3393 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3394 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3395 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3396 ReduceListArgAlloca, Builder.getPtrTy(),
3397 ReduceListArgAlloca->getName() + ".ascast");
3398 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3399 LocalReduceList, Builder.getPtrTy(),
3400 LocalReduceList->getName() + ".ascast");
3401
3402 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3403 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3404 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3405
3406 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3407 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3408 Type *IndexTy = Builder.getIndexTy(
3409 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3410 for (auto En : enumerate(ReductionInfos)) {
3411 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3412 RedListArrayTy, ReductionList,
3413 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3414 // Global = Buffer.VD[Idx];
3415 Value *BufferVD =
3416 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3417 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3418 ReductionsBufferTy, BufferVD, 0, En.index());
3419 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3420 }
3421
3422 // Call reduce_function(ReduceList, GlobalReduceList)
3423 Value *ReduceList =
3424 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3425 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3426 ->addFnAttr(Attribute::NoUnwind);
3427 Builder.CreateRetVoid();
3428 Builder.restoreIP(OldIP);
3429 return LtGRFunc;
3430}
3431
3432std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3433 std::string Suffix =
3434 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3435 return (Name + Suffix).str();
3436}
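// For example (illustrative, the separators are target/config dependent): a
// reducer named "foo" typically maps to "foo.omp.reduction.reduction_func".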
3437
3438Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3439 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3440 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3441 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3442 {Builder.getPtrTy(), Builder.getPtrTy()},
3443 /* IsVarArg */ false);
3444 std::string Name = getReductionFuncName(ReducerName);
3445 Function *ReductionFunc =
3446 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3447 ReductionFunc->setAttributes(FuncAttrs);
3448 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3449 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3450 BasicBlock *EntryBB =
3451 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3452 Builder.SetInsertPoint(EntryBB);
3453
3454 // Need to alloca memory here and deal with the pointers before getting
3455 // LHS/RHS pointers out
3456 Value *LHSArrayPtr = nullptr;
3457 Value *RHSArrayPtr = nullptr;
3458 Argument *Arg0 = ReductionFunc->getArg(0);
3459 Argument *Arg1 = ReductionFunc->getArg(1);
3460 Type *Arg0Type = Arg0->getType();
3461 Type *Arg1Type = Arg1->getType();
3462
3463 Value *LHSAlloca =
3464 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3465 Value *RHSAlloca =
3466 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3467 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3468 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3469 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3470 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3471 Builder.CreateStore(Arg0, LHSAddrCast);
3472 Builder.CreateStore(Arg1, RHSAddrCast);
3473 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3474 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3475
3476 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3477 Type *IndexTy = Builder.getIndexTy(
3478 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3479 SmallVector<Value *> LHSPtrs, RHSPtrs;
3480 for (auto En : enumerate(ReductionInfos)) {
3481 const ReductionInfo &RI = En.value();
3482 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3483 RedArrayTy, RHSArrayPtr,
3484 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3485 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3486 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RHSI8Ptr, RI.PrivateVariable->getType(),
3488 RHSI8Ptr->getName() + ".ascast");
3489
3490 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3491 RedArrayTy, LHSArrayPtr,
3492 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3493 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3494 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3495 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3496
3497 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3498 LHSPtrs.emplace_back(LHSPtr);
3499 RHSPtrs.emplace_back(RHSPtr);
3500 } else {
3501 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3502 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3503 Value *Reduced;
3504 InsertPointOrErrorTy AfterIP =
3505 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3506 if (!AfterIP)
3507 return AfterIP.takeError();
3508 if (!Builder.GetInsertBlock())
3509 return ReductionFunc;
3510
3511 Builder.restoreIP(*AfterIP);
3512 Builder.CreateStore(Reduced, LHSPtr);
3513 }
3514 }
3515
3516 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3517 for (auto En : enumerate(ReductionInfos)) {
3518 unsigned Index = En.index();
3519 const ReductionInfo &RI = En.value();
3520 Value *LHSFixupPtr, *RHSFixupPtr;
3521 Builder.restoreIP(RI.ReductionGenClang(
3522 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3523
3524 // Fix the generated callback code to use the correct Values for the LHS
3525 // and RHS
3526 LHSFixupPtr->replaceUsesWithIf(
3527 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3528 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3529 ReductionFunc;
3530 });
3531 RHSFixupPtr->replaceUsesWithIf(
3532 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3533 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3534 ReductionFunc;
3535 });
3536 }
3537
3538 Builder.CreateRetVoid();
3539 return ReductionFunc;
3540}
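// The generated reduction function has the following shape (sketch):
//
//   void <reducer-name>.omp.reduction.reduction_func(void *lhs_list,
//                                                    void *rhs_list) {
//     for each reduction i:
//       *lhs_list[i] = combine(*lhs_list[i], *rhs_list[i]);
//   }
//
// where combine() stands for the frontend-provided combiner emitted through
// ReductionGen or ReductionGenClang.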
3541
3542static void
3543checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3544 bool IsGPU) {
3545 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3546 (void)RI;
3547 assert(RI.Variable && "expected non-null variable");
3548 assert(RI.PrivateVariable && "expected non-null private variable");
3549 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3550 "expected non-null reduction generator callback");
3551 if (!IsGPU) {
3552 assert(
3553 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3554 "expected variables and their private equivalents to have the same "
3555 "type");
3556 }
3557 assert(RI.Variable->getType()->isPointerTy() &&
3558 "expected variables to be pointers");
3559 }
3560}
3561
3562OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3563 const LocationDescription &Loc, InsertPointTy AllocaIP,
3564 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3565 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3566 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3567 Value *SrcLocInfo) {
3568 if (!updateToLocation(Loc))
3569 return InsertPointTy();
3570 Builder.restoreIP(CodeGenIP);
3571 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3572 LLVMContext &Ctx = M.getContext();
3573
3574 // Source location for the ident struct
3575 if (!SrcLocInfo) {
3576 uint32_t SrcLocStrSize;
3577 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3578 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3579 }
3580
3581 if (ReductionInfos.size() == 0)
3582 return Builder.saveIP();
3583
3584 BasicBlock *ContinuationBlock = nullptr;
3585 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3586 // Copied code from createReductions
3587 BasicBlock *InsertBlock = Loc.IP.getBlock();
3588 ContinuationBlock =
3589 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3590 InsertBlock->getTerminator()->eraseFromParent();
3591 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3592 }
3593
3594 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3595 AttributeList FuncAttrs;
3596 AttrBuilder AttrBldr(Ctx);
3597 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3598 AttrBldr.addAttribute(Attr);
3599 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3600 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3601
3602 CodeGenIP = Builder.saveIP();
3603 Expected<Function *> ReductionResult =
3604 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3605 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3606 if (!ReductionResult)
3607 return ReductionResult.takeError();
3608 Function *ReductionFunc = *ReductionResult;
3609 Builder.restoreIP(CodeGenIP);
3610
3611 // Set the grid value in the config needed for lowering later on
3612 if (GridValue.has_value())
3613 Config.setGridValue(GridValue.value());
3614 else
3615 Config.setGridValue(getGridValue(T, ReductionFunc));
3616
3617 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3618 // RedList, shuffle_reduce_func, interwarp_copy_func);
3619 // or
3620 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3621 Value *Res;
3622
3623 // 1. Build a list of reduction variables.
3624 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3625 auto Size = ReductionInfos.size();
3626 Type *PtrTy = PointerType::getUnqual(Ctx);
3627 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3628 CodeGenIP = Builder.saveIP();
3629 Builder.restoreIP(AllocaIP);
3630 Value *ReductionListAlloca =
3631 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3632 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3633 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3634 Builder.restoreIP(CodeGenIP);
3635 Type *IndexTy = Builder.getIndexTy(
3636 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3637 for (auto En : enumerate(ReductionInfos)) {
3638 const ReductionInfo &RI = En.value();
3639 Value *ElemPtr = Builder.CreateInBoundsGEP(
3640 RedArrayTy, ReductionList,
3641 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3642 Value *CastElem =
3643 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3644 Builder.CreateStore(CastElem, ElemPtr);
3645 }
3646 CodeGenIP = Builder.saveIP();
3647 Function *SarFunc =
3648 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3649 Expected<Function *> CopyResult =
3650 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3651 if (!CopyResult)
3652 return CopyResult.takeError();
3653 Function *WcFunc = *CopyResult;
3654 Builder.restoreIP(CodeGenIP);
3655
3656 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3657
3658 unsigned MaxDataSize = 0;
3659 SmallVector<Type *> ReductionTypeArgs;
3660 for (auto En : enumerate(ReductionInfos)) {
3661 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3662 if (Size > MaxDataSize)
3663 MaxDataSize = Size;
3664 ReductionTypeArgs.emplace_back(En.value().ElementType);
3665 }
3666 Value *ReductionDataSize =
3667 Builder.getInt64(MaxDataSize * ReductionInfos.size());
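// For example (illustrative): reducing one double and one int32 gives
// MaxDataSize = max(8, 4) = 8, so ReductionDataSize = 8 * 2 = 16 bytes.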
3668 if (!IsTeamsReduction) {
3669 Value *SarFuncCast =
3670 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3671 Value *WcFuncCast =
3672 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3673 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3674 WcFuncCast};
3675 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3676 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3677 Res = Builder.CreateCall(Pv2Ptr, Args);
3678 } else {
3679 CodeGenIP = Builder.saveIP();
3680 StructType *ReductionsBufferTy = StructType::create(
3681 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3682 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
3683 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3684 Function *LtGCFunc = emitListToGlobalCopyFunction(
3685 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3686 Function *LtGRFunc = emitListToGlobalReduceFunction(
3687 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3688 Function *GtLCFunc = emitGlobalToListCopyFunction(
3689 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3690 Function *GtLRFunc = emitGlobalToListReduceFunction(
3691 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3692 Builder.restoreIP(CodeGenIP);
3693
3694 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3695 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3696
3697 Value *Args3[] = {SrcLocInfo,
3698 KernelTeamsReductionPtr,
3699 Builder.getInt32(ReductionBufNum),
3700 ReductionDataSize,
3701 RL,
3702 SarFunc,
3703 WcFunc,
3704 LtGCFunc,
3705 LtGRFunc,
3706 GtLCFunc,
3707 GtLRFunc};
3708
3709 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3710 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3711 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3712 }
3713
3714 // 5. Build if (res == 1)
3715 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3716 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3717 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3718 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3719
3720 // 6. Build then branch: where we have reduced values in the master
3721 // thread in each team.
3722 // __kmpc_end_reduce{_nowait}(<gtid>);
3723 // break;
3724 emitBlock(ThenBB, CurFunc);
3725
3726 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3727 for (auto En : enumerate(ReductionInfos)) {
3728 const ReductionInfo &RI = En.value();
3729 Value *LHS = RI.Variable;
3730 Value *RHS =
3731 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3732
3733 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3734 Value *LHSPtr, *RHSPtr;
3735 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3736 &LHSPtr, &RHSPtr, CurFunc));
3737
3738 // Fix the generated callback code to use the correct Values for the LHS
3739 // and RHS
3740 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3741 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3742 ReductionFunc;
3743 });
3744 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3745 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3746 ReductionFunc;
3747 });
3748 } else {
3749 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3750 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3751 Value *Reduced;
3752 InsertPointOrErrorTy AfterIP =
3753 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3754 if (!AfterIP)
3755 return AfterIP.takeError();
3756 Builder.restoreIP(*AfterIP);
3757 Builder.CreateStore(Reduced, LHS, false);
3758 }
3759 }
3760 emitBlock(ExitBB, CurFunc);
3761 if (ContinuationBlock) {
3762 Builder.CreateBr(ContinuationBlock);
3763 Builder.SetInsertPoint(ContinuationBlock);
3764 }
3765 Config.setEmitLLVMUsed();
3766
3767 return Builder.saveIP();
3768}
3769
3770static Function *getFreshReductionFunc(Module &M) {
3771 Type *VoidTy = Type::getVoidTy(M.getContext());
3772 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3773 auto *FuncTy =
3774 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3775 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3776 ".omp.reduction.func", &M);
3777}
3778
3779static Error populateReductionFunction(
3780 Function *ReductionFunc,
3781 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3782 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3783 Module *Module = ReductionFunc->getParent();
3784 BasicBlock *ReductionFuncBlock =
3785 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3786 Builder.SetInsertPoint(ReductionFuncBlock);
3787 Value *LHSArrayPtr = nullptr;
3788 Value *RHSArrayPtr = nullptr;
3789 if (IsGPU) {
3790 // Need to alloca memory here and deal with the pointers before getting
3791 // LHS/RHS pointers out
3792 //
3793 Argument *Arg0 = ReductionFunc->getArg(0);
3794 Argument *Arg1 = ReductionFunc->getArg(1);
3795 Type *Arg0Type = Arg0->getType();
3796 Type *Arg1Type = Arg1->getType();
3797
3798 Value *LHSAlloca =
3799 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3800 Value *RHSAlloca =
3801 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3802 Value *LHSAddrCast =
3803 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3804 Value *RHSAddrCast =
3805 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3806 Builder.CreateStore(Arg0, LHSAddrCast);
3807 Builder.CreateStore(Arg1, RHSAddrCast);
3808 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3809 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3810 } else {
3811 LHSArrayPtr = ReductionFunc->getArg(0);
3812 RHSArrayPtr = ReductionFunc->getArg(1);
3813 }
3814
3815 unsigned NumReductions = ReductionInfos.size();
3816 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3817
3818 for (auto En : enumerate(ReductionInfos)) {
3819 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3820 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3821 RedArrayTy, LHSArrayPtr, 0, En.index());
3822 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3823 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3824 LHSI8Ptr, RI.Variable->getType());
3825 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3826 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3827 RedArrayTy, RHSArrayPtr, 0, En.index());
3828 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3829 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3830 RHSI8Ptr, RI.PrivateVariable->getType());
3831 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3832 Value *Reduced;
3833 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3834 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3835 if (!AfterIP)
3836 return AfterIP.takeError();
3837
3838 Builder.restoreIP(*AfterIP);
3839 // TODO: Consider flagging an error.
3840 if (!Builder.GetInsertBlock())
3841 return Error::success();
3842
3843 // The store is inside the reduction region when using by-ref.
3844 if (!IsByRef[En.index()])
3845 Builder.CreateStore(Reduced, LHSPtr);
3846 }
3847 Builder.CreateRetVoid();
3848 return Error::success();
3849}
3850
3851OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3852 const LocationDescription &Loc, InsertPointTy AllocaIP,
3853 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3854 bool IsNoWait, bool IsTeamsReduction) {
3855 assert(ReductionInfos.size() == IsByRef.size());
3856 if (Config.isGPU())
3857 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3858 IsNoWait, IsTeamsReduction);
3859
3860 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3861
3862 if (!updateToLocation(Loc))
3863 return InsertPointTy();
3864
3865 if (ReductionInfos.size() == 0)
3866 return Builder.saveIP();
3867
3868 BasicBlock *InsertBlock = Loc.IP.getBlock();
3869 BasicBlock *ContinuationBlock =
3870 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3871 InsertBlock->getTerminator()->eraseFromParent();
3872
3873 // Create and populate array of type-erased pointers to private reduction
3874 // values.
3875 unsigned NumReductions = ReductionInfos.size();
3876 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3877 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3878 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3879
3880 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3881
3882 for (auto En : enumerate(ReductionInfos)) {
3883 unsigned Index = En.index();
3884 const ReductionInfo &RI = En.value();
3885 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3886 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3887 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3888 }
3889
3890 // Emit a call to the runtime function that orchestrates the reduction.
3891 // Declare the reduction function in the process.
3892 Type *IndexTy = Builder.getIndexTy(
3893 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3894 Function *Func = Builder.GetInsertBlock()->getParent();
3895 Module *Module = Func->getParent();
3896 uint32_t SrcLocStrSize;
3897 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3898 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3899 return RI.AtomicReductionGen;
3900 });
3901 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3902 CanGenerateAtomic
3903 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3904 : IdentFlag(0));
3905 Value *ThreadId = getOrCreateThreadID(Ident);
3906 Constant *NumVariables = Builder.getInt32(NumReductions);
3907 const DataLayout &DL = Module->getDataLayout();
3908 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3909 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3910 Function *ReductionFunc = getFreshReductionFunc(*Module);
3911 Value *Lock = getOMPCriticalRegionLock(".reduction");
3912 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3913 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3914 : RuntimeFunction::OMPRTL___kmpc_reduce);
3915 CallInst *ReduceCall =
3916 Builder.CreateCall(ReduceFunc,
3917 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3918 ReductionFunc, Lock},
3919 "reduce");
3920
3921 // Create final reduction entry blocks for the atomic and non-atomic case.
3922 // Emit IR that dispatches control flow to one of the blocks based on the
3923 // reduction supporting the atomic mode.
3924 BasicBlock *NonAtomicRedBlock =
3925 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3926 BasicBlock *AtomicRedBlock =
3927 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3929 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3930 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3931 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3932
3933 // Populate the non-atomic reduction using the elementwise reduction function.
3934 // This loads the elements from the global and private variables and reduces
3935 // them before storing back the result to the global variable.
3936 Builder.SetInsertPoint(NonAtomicRedBlock);
3937 for (auto En : enumerate(ReductionInfos)) {
3938 const ReductionInfo &RI = En.value();
3939 Type *ValueType = RI.ElementType;
3940 // We have one fewer load in the by-ref case because that load is now
3941 // inside the reduction region.
3942 Value *RedValue = RI.Variable;
3943 if (!IsByRef[En.index()]) {
3944 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3945 "red.value." + Twine(En.index()));
3946 }
3947 Value *PrivateRedValue =
3948 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3949 "red.private.value." + Twine(En.index()));
3950 Value *Reduced;
3951 InsertPointOrErrorTy AfterIP =
3952 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3953 if (!AfterIP)
3954 return AfterIP.takeError();
3955 Builder.restoreIP(*AfterIP);
3956
3957 if (!Builder.GetInsertBlock())
3958 return InsertPointTy();
3959 // For the by-ref case, the load is inside the reduction region.
3960 if (!IsByRef[En.index()])
3961 Builder.CreateStore(Reduced, RI.Variable);
3962 }
3963 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3964 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3965 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3966 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3967 Builder.CreateBr(ContinuationBlock);
3968
3969 // Populate the atomic reduction using the atomic elementwise reduction
3970 // function. There are no loads/stores here because they will be happening
3971 // inside the atomic elementwise reduction.
3972 Builder.SetInsertPoint(AtomicRedBlock);
3973 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3974 for (const ReductionInfo &RI : ReductionInfos) {
3975 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3976 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3977 if (!AfterIP)
3978 return AfterIP.takeError();
3979 Builder.restoreIP(*AfterIP);
3980 if (!Builder.GetInsertBlock())
3981 return InsertPointTy();
3982 }
3983 Builder.CreateBr(ContinuationBlock);
3984 } else {
3985 Builder.CreateUnreachable();
3986 }
3987
3988 // Populate the outlined reduction function using the elementwise reduction
3989 // function. Partial values are extracted from the type-erased array of
3990 // pointers to private variables.
3991 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3992 IsByRef, /*isGPU=*/false);
3993 if (Err)
3994 return Err;
3995
3996 if (!Builder.GetInsertBlock())
3997 return InsertPointTy();
3998
3999 Builder.SetInsertPoint(ContinuationBlock);
4000 return Builder.saveIP();
4001}
4002
4003OpenMPIRBuilder::InsertPointOrErrorTy
4004OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4005 BodyGenCallbackTy BodyGenCB,
4006 FinalizeCallbackTy FiniCB) {
4007 if (!updateToLocation(Loc))
4008 return Loc.IP;
4009
4010 Directive OMPD = Directive::OMPD_master;
4011 uint32_t SrcLocStrSize;
4012 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4013 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4014 Value *ThreadId = getOrCreateThreadID(Ident);
4015 Value *Args[] = {Ident, ThreadId};
4016
4017 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4018 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4019
4020 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4021 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4022
4023 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4024 /*Conditional*/ true, /*hasFinalize*/ true);
4025}
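// With Conditional==true the inlined region is guarded roughly as (sketch):
//
//   if (__kmpc_master(&loc, tid)) {
//     <body>
//     __kmpc_end_master(&loc, tid);
//   }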
4026
4027OpenMPIRBuilder::InsertPointOrErrorTy
4028OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4029 BodyGenCallbackTy BodyGenCB,
4030 FinalizeCallbackTy FiniCB, Value *Filter) {
4031 if (!updateToLocation(Loc))
4032 return Loc.IP;
4033
4034 Directive OMPD = Directive::OMPD_masked;
4035 uint32_t SrcLocStrSize;
4036 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4037 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4038 Value *ThreadId = getOrCreateThreadID(Ident);
4039 Value *Args[] = {Ident, ThreadId, Filter};
4040 Value *ArgsEnd[] = {Ident, ThreadId};
4041
4042 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4043 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4044
4045 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4046 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4047
4048 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4049 /*Conditional*/ true, /*hasFinalize*/ true);
4050}
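// Analogous to createMaster, but guarded by a thread filter (sketch):
//
//   if (__kmpc_masked(&loc, tid, filter)) {
//     <body>
//     __kmpc_end_masked(&loc, tid);
//   }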
4051
4052static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
4053 llvm::FunctionCallee Callee,
4054 ArrayRef<llvm::Value *> Args,
4055 const llvm::Twine &Name) {
4056 llvm::CallInst *Call = Builder.CreateCall(
4057 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4058 Call->setDoesNotThrow();
4059 return Call;
4060}
4061
4062// Expects the input basic block to be dominated by BeforeScanBB.
4063// Once the scan directive is encountered, the code after it should be
4064// dominated by AfterScanBB. The scan directive splits the code sequence
4065// into an input phase and a scan phase. Based on whether the inclusive or
4066// exclusive clause is used on the scan directive, and whether the input
4067// loop or the scan loop is being lowered, jumps to the input and scan
4068// phases are added. The first scan loop is the input loop and the second
4069// is the scan loop. The generated code currently handles only inclusive scans.
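// Sketch of what is emitted for one scan variable "red" with its temporary
// buffer "buf" (illustrative, inclusive scan):
//
//   first (input) loop body:   <input phase>; buf[i] = red;
//   second (scan) loop body:   red = buf[i]; <scan phase>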
4070OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4071 const LocationDescription &Loc, InsertPointTy AllocaIP,
4072 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4073 bool IsInclusive, ScanInfo *ScanRedInfo) {
4074 if (ScanRedInfo->OMPFirstScanLoop) {
4075 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4076 ScanVarsType, ScanRedInfo);
4077 if (Err)
4078 return Err;
4079 }
4080 if (!updateToLocation(Loc))
4081 return Loc.IP;
4082
4083 llvm::Value *IV = ScanRedInfo->IV;
4084
4085 if (ScanRedInfo->OMPFirstScanLoop) {
4086 // Emit buffer[i] = red; at the end of the input phase.
4087 for (size_t i = 0; i < ScanVars.size(); i++) {
4088 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4089 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4090 Type *DestTy = ScanVarsType[i];
4091 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4092 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4093
4094 Builder.CreateStore(Src, Val);
4095 }
4096 }
4097 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4098 emitBlock(ScanRedInfo->OMPScanDispatch,
4099 Builder.GetInsertBlock()->getParent());
4100
4101 if (!ScanRedInfo->OMPFirstScanLoop) {
4102 IV = ScanRedInfo->IV;
4103 // Emit red = buffer[i]; at the entrance to the scan phase.
4104 // TODO: for an exclusive scan, this needs to be updated to red = buffer[i-1].
4105 for (size_t i = 0; i < ScanVars.size(); i++) {
4106 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4107 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4108 Type *DestTy = ScanVarsType[i];
4109 Value *SrcPtr =
4110 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4111 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4112 Builder.CreateStore(Src, ScanVars[i]);
4113 }
4114 }
4115
4116 // TODO: Update it to CreateBr and remove dead blocks
4117 llvm::Value *CmpI = Builder.getInt1(true);
4118 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4119 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4120 ScanRedInfo->OMPAfterScanBlock);
4121 } else {
4122 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4123 ScanRedInfo->OMPBeforeScanBlock);
4124 }
4125 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4126 Builder.GetInsertBlock()->getParent());
4127 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4128 return Builder.saveIP();
4129}
4130
4131Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4132 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4133 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4134
4135 Builder.restoreIP(AllocaIP);
4136 // Create the shared pointer at alloca IP.
4137 for (size_t i = 0; i < ScanVars.size(); i++) {
4138 llvm::Value *BuffPtr =
4139 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4140 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4141 }
4142
4143 // Allocate temporary buffer by master thread
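// Roughly, as if the masked (filter == 0) thread executed (sketch):
//   buf = malloc((span + 1) * sizeof(elem));  // one such buffer per scan var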
4144 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4145 InsertPointTy CodeGenIP) -> Error {
4146 Builder.restoreIP(CodeGenIP);
4147 Value *AllocSpan =
4148 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4149 for (size_t i = 0; i < ScanVars.size(); i++) {
4150 Type *IntPtrTy = Builder.getInt32Ty();
4151 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4152 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4153 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4154 AllocSpan, nullptr, "arr");
4155 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4156 }
4157 return Error::success();
4158 };
4159 // TODO: Perform finalization actions for variables. This has to be
4160 // called for variables which have destructors/finalizers.
4161 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4162
4163 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4164 llvm::Value *FilterVal = Builder.getInt32(0);
4165 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4166 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4167
4168 if (!AfterIP)
4169 return AfterIP.takeError();
4170 Builder.restoreIP(*AfterIP);
4171 BasicBlock *InputBB = Builder.GetInsertBlock();
4172 if (InputBB->getTerminator())
4173 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4174 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4175 if (!AfterIP)
4176 return AfterIP.takeError();
4177 Builder.restoreIP(*AfterIP);
4178
4179 return Error::success();
4180}
4181
4182Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4183 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4184 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4185 InsertPointTy CodeGenIP) -> Error {
4186 Builder.restoreIP(CodeGenIP);
4187 for (ReductionInfo RedInfo : ReductionInfos) {
4188 Value *PrivateVar = RedInfo.PrivateVariable;
4189 Value *OrigVar = RedInfo.Variable;
4190 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4191 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4192
4193 Type *SrcTy = RedInfo.ElementType;
4194 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4195 "arrayOffset");
4196 Value *Src = Builder.CreateLoad(SrcTy, Val);
4197
4198 Builder.CreateStore(Src, OrigVar);
4199 Builder.CreateFree(Buff);
4200 }
4201 return Error::success();
4202 };
4203 // TODO: Perform finalization actions for variables. This has to be
4204 // called for variables which have destructors/finalizers.
4205 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4206
4207 if (ScanRedInfo->OMPScanFinish->getTerminator())
4208 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4209 else
4210 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4211
4212 llvm::Value *FilterVal = Builder.getInt32(0);
4213 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4214 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4215
4216 if (!AfterIP)
4217 return AfterIP.takeError();
4218 Builder.restoreIP(*AfterIP);
4219 BasicBlock *InputBB = Builder.GetInsertBlock();
4220 if (InputBB->getTerminator())
4221 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4222 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4223 if (!AfterIP)
4224 return AfterIP.takeError();
4225 Builder.restoreIP(*AfterIP);
4226 return Error::success();
4227}
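// Net effect per reduction variable (sketch): the masked thread performs
//   orig = buf[span]; free(buf);
// writing the final inclusive-scan value back to the original variable and
// releasing the temporary buffer.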
4228
4229OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4230 const LocationDescription &Loc,
4231 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4232 ScanInfo *ScanRedInfo) {
4233
4234 if (!updateToLocation(Loc))
4235 return Loc.IP;
4236 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4237 InsertPointTy CodeGenIP) -> Error {
4238 Builder.restoreIP(CodeGenIP);
4239 Function *CurFn = Builder.GetInsertBlock()->getParent();
4240 // for (int k = 0; k <= ceil(log2(n)); ++k)
4241 llvm::BasicBlock *LoopBB =
4242 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4243 llvm::BasicBlock *ExitBB =
4244 splitBB(Builder, false, "omp.outer.log.scan.exit");
4245 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4246 Builder.GetInsertBlock()->getModule(),
4247 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4248 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4249 llvm::Value *Arg =
4250 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4251 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4252 F = llvm::Intrinsic::getOrInsertDeclaration(
4253 Builder.GetInsertBlock()->getModule(),
4254 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4255 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4256 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4257 llvm::Value *NMin1 = Builder.CreateNUWSub(
4258 ScanRedInfo->Span,
4259 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4260 Builder.SetInsertPoint(InputBB);
4261 Builder.CreateBr(LoopBB);
4262 emitBlock(LoopBB, CurFn);
4263 Builder.SetInsertPoint(LoopBB);
4264
4265 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4266 // size pow2k = 1;
4267 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4268 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4269 InputBB);
4270 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4271 InputBB);
4272 // for (size i = n - 1; i >= 2 ^ k; --i)
4273 // tmp[i] op= tmp[i-pow2k];
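// For illustration, with op = + and tmp = [a, b, c, d] (n = 4):
//   k=0, pow2k=1: tmp becomes [a, a+b, b+c, c+d]
//   k=1, pow2k=2: tmp becomes [a, a+b, a+b+c, a+b+c+d]
// i.e. an inclusive prefix scan computed in ceil(log2(n)) passes; each
// pass combines elements pow2k apart, walking from the highest index
// down so the update can be done in place.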
4274 llvm::BasicBlock *InnerLoopBB =
4275 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4276 llvm::BasicBlock *InnerExitBB =
4277 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4278 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4279 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4280 emitBlock(InnerLoopBB, CurFn);
4281 Builder.SetInsertPoint(InnerLoopBB);
4282 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4283 IVal->addIncoming(NMin1, LoopBB);
4284 for (ReductionInfo RedInfo : ReductionInfos) {
4285 Value *ReductionVal = RedInfo.PrivateVariable;
4286 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4287 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4288 Type *DestTy = RedInfo.ElementType;
4289 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4290 Value *LHSPtr =
4291 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4292 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4293 Value *RHSPtr =
4294 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4295 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4296 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4297 Value *Result;
4298 InsertPointOrErrorTy AfterIP =
4299 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4300 if (!AfterIP)
4301 return AfterIP.takeError();
4302 Builder.CreateStore(Result, LHSPtr);
4303 }
4304 llvm::Value *NextIVal = Builder.CreateNUWSub(
4305 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4306 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4307 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4308 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4309 emitBlock(InnerExitBB, CurFn);
4310 llvm::Value *Next = Builder.CreateNUWAdd(
4311 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4312 Counter->addIncoming(Next, Builder.GetInsertBlock());
4313 // pow2k <<= 1;
4314 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4315 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4316 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4317 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4318 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4319 return Error::success();
4320 };
4321
4322 // TODO: Perform finalization actions for variables. This has to be
4323 // called for variables which have destructors/finalizers.
4324 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4325
4326 llvm::Value *FilterVal = Builder.getInt32(0);
4327 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4328 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4329
4330 if (!AfterIP)
4331 return AfterIP.takeError();
4332 Builder.restoreIP(*AfterIP);
4333 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4334
4335 if (!AfterIP)
4336 return AfterIP.takeError();
4337 Builder.restoreIP(*AfterIP);
4338 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4339 if (Err)
4340 return Err;
4341
4342 return AfterIP;
4343}
4344
4345Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4346 llvm::function_ref<Error()> InputLoopGen,
4347 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4348 ScanInfo *ScanRedInfo) {
4349
4350 {
4351 // Emit loop with input phase:
4352 // for (i: 0..<num_iters>) {
4353 // <input phase>;
4354 // buffer[i] = red;
4355 // }
4356 ScanRedInfo->OMPFirstScanLoop = true;
4357 Error Err = InputLoopGen();
4358 if (Err)
4359 return Err;
4360 }
4361 {
4362 // Emit loop with scan phase:
4363 // for (i: 0..<num_iters>) {
4364 // red = buffer[i];
4365 // <scan phase>;
4366 // }
4367 ScanRedInfo->OMPFirstScanLoop = false;
4368 Error Err = ScanLoopGen(Builder.saveIP());
4369 if (Err)
4370 return Err;
4371 }
4372 return Error::success();
4373}
4374
4375void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4376 Function *Fun = Builder.GetInsertBlock()->getParent();
4377 ScanRedInfo->OMPScanDispatch =
4378 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4379 ScanRedInfo->OMPAfterScanBlock =
4380 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4381 ScanRedInfo->OMPBeforeScanBlock =
4382 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4383 ScanRedInfo->OMPScanLoopExit =
4384 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4385}
4386CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4387 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4388 BasicBlock *PostInsertBefore, const Twine &Name) {
4389 Module *M = F->getParent();
4390 LLVMContext &Ctx = M->getContext();
4391 Type *IndVarTy = TripCount->getType();
4392
4393 // Create the basic block structure.
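// As a sketch, the skeleton built below looks like:
//
//   preheader:
//     br %header
//   header:
//     %iv = phi [0, %preheader], [%iv.next, %latch]
//     br %cond
//   cond:
//     %cmp = icmp ult %iv, %tripcount
//     br %cmp, label %body, label %exit
//   body:
//     br %latch
//   latch:
//     %iv.next = add nuw %iv, 1
//     br %header
//   exit:
//     br %after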
4394 BasicBlock *Preheader =
4395 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4396 BasicBlock *Header =
4397 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4398 BasicBlock *Cond =
4399 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4400 BasicBlock *Body =
4401 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4402 BasicBlock *Latch =
4403 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4404 BasicBlock *Exit =
4405 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4406 BasicBlock *After =
4407 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4408
4409 // Use specified DebugLoc for new instructions.
4410 Builder.SetCurrentDebugLocation(DL);
4411
4412 Builder.SetInsertPoint(Preheader);
4413 Builder.CreateBr(Header);
4414
4415 Builder.SetInsertPoint(Header);
4416 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4417 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4418 Builder.CreateBr(Cond);
4419
4420 Builder.SetInsertPoint(Cond);
4421 Value *Cmp =
4422 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4423 Builder.CreateCondBr(Cmp, Body, Exit);
4424
4425 Builder.SetInsertPoint(Body);
4426 Builder.CreateBr(Latch);
4427
4428 Builder.SetInsertPoint(Latch);
4429 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4430 "omp_" + Name + ".next", /*HasNUW=*/true);
4431 Builder.CreateBr(Header);
4432 IndVarPHI->addIncoming(Next, Latch);
4433
4434 Builder.SetInsertPoint(Exit);
4435 Builder.CreateBr(After);
4436
4437 // Remember and return the canonical control flow.
4438 LoopInfos.emplace_front();
4439 CanonicalLoopInfo *CL = &LoopInfos.front();
4440
4441 CL->Header = Header;
4442 CL->Cond = Cond;
4443 CL->Latch = Latch;
4444 CL->Exit = Exit;
4445
4446#ifndef NDEBUG
4447 CL->assertOK();
4448#endif
4449 return CL;
4450}
4451
4452Expected<CanonicalLoopInfo *>
4453OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4454 LoopBodyGenCallbackTy BodyGenCB,
4455 Value *TripCount, const Twine &Name) {
4456 BasicBlock *BB = Loc.IP.getBlock();
4457 BasicBlock *NextBB = BB->getNextNode();
4458
4459 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4460 NextBB, NextBB, Name);
4461 BasicBlock *After = CL->getAfter();
4462
4463 // If location is not set, don't connect the loop.
4464 if (updateToLocation(Loc)) {
4465 // Split the loop at the insertion point: Branch to the preheader and move
4466 // every following instruction to after the loop (the After BB). Also, the
4467 // new successor is the loop's after block.
4468 spliceBB(Builder, After, /*CreateBranch=*/false);
4469 Builder.CreateBr(CL->getPreheader());
4470 }
4471
4472 // Emit the body content. We do it after connecting the loop to the CFG so
4473 // that the callback does not encounter degenerate BBs.
4474 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4475 return Err;
4476
4477#ifndef NDEBUG
4478 CL->assertOK();
4479#endif
4480 return CL;
4481}
4482
4483Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4484 ScanInfos.emplace_front();
4485 ScanInfo *Result = &ScanInfos.front();
4486 return Result;
4487}
4488
4489Expected<SmallVector<CanonicalLoopInfo *>>
4490OpenMPIRBuilder::createCanonicalScanLoops(
4491 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4492 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4493 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4494 LocationDescription ComputeLoc =
4495 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4496 updateToLocation(ComputeLoc);
4497
4498 SmallVector<CanonicalLoopInfo *> Result;
4499
4500 Value *TripCount = calculateCanonicalLoopTripCount(
4501 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4502 ScanRedInfo->Span = TripCount;
4503 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4504 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4505
4506 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4507 Builder.restoreIP(CodeGenIP);
4508 ScanRedInfo->IV = IV;
4509 createScanBBs(ScanRedInfo);
4510 BasicBlock *InputBlock = Builder.GetInsertBlock();
4511 Instruction *Terminator = InputBlock->getTerminator();
4512 assert(Terminator->getNumSuccessors() == 1);
4513 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4514 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4515 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4516 Builder.GetInsertBlock()->getParent());
4517 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4518 emitBlock(ScanRedInfo->OMPScanLoopExit,
4519 Builder.GetInsertBlock()->getParent());
4520 Builder.CreateBr(ContinueBlock);
4521 Builder.SetInsertPoint(
4522 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4523 return BodyGenCB(Builder.saveIP(), IV);
4524 };
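// The BodyGen callback above re-routes each generated loop body through
// the scan machinery, roughly:
//   input block -> omp.inscan.dispatch -> omp.before.scan.bb
//               -> omp.scan.loop.exit -> continue block
// omp.after.scan.bb is wired up later, when the scan directive inside
// the body is lowered.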
4525
4526 const auto &&InputLoopGen = [&]() -> Error {
4527 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4528 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4529 ComputeIP, Name, true, ScanRedInfo);
4530 if (!LoopInfo)
4531 return LoopInfo.takeError();
4532 Result.push_back(*LoopInfo);
4533 Builder.restoreIP((*LoopInfo)->getAfterIP());
4534 return Error::success();
4535 };
4536 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4537 Expected<CanonicalLoopInfo *> LoopInfo =
4538 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4539 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4540 if (!LoopInfo)
4541 return LoopInfo.takeError();
4542 Result.push_back(*LoopInfo);
4543 Builder.restoreIP((*LoopInfo)->getAfterIP());
4544 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4545 return Error::success();
4546 };
4547 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4548 if (Err)
4549 return Err;
4550 return Result;
4551}
4552
4553Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4554 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4555 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4556
4557 // Consider the following difficulties (assuming 8-bit signed integers):
4558 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4559 // DO I = 1, 100, 50
4560 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4561 // DO I = 100, 0, -128
4562
4563 // Start, Stop and Step must be of the same integer type.
4564 auto *IndVarTy = cast<IntegerType>(Start->getType());
4565 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4566 assert(IndVarTy == Step->getType() && "Step type mismatch");
4567
4568 updateToLocation(Loc);
4569
4570 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4571 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4572
4573 // Like Step, but always positive.
4574 Value *Incr = Step;
4575
4576 // Distance between Start and Stop; always positive.
4577 Value *Span;
4578
4579 // Condition whether no iterations are executed at all, e.g. because
4580 // UB < LB.
4581 Value *ZeroCmp;
4582
4583 if (IsSigned) {
4584 // Ensure that increment is positive. If not, negate and invert LB and UB.
4585 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4586 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4587 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4588 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4589 Span = Builder.CreateSub(UB, LB, "", false, true);
4590 ZeroCmp = Builder.CreateICmp(
4591 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4592 } else {
4593 Span = Builder.CreateSub(Stop, Start, "", true);
4594 ZeroCmp = Builder.CreateICmp(
4595 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4596 }
4597
4598 Value *CountIfLooping;
4599 if (InclusiveStop) {
4600 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4601 } else {
4602 // Avoid incrementing past stop since it could overflow.
4603 Value *CountIfTwo = Builder.CreateAdd(
4604 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4605 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4606 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4607 }
4608
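// For illustration: Start=0, Stop=10, Step=3 with InclusiveStop=false
// gives Span=10 and Incr=3, so CountIfTwo = (10-1)/3+1 = 4 and, since
// Span > Incr, the trip count is 4 (iterations 0, 3, 6, 9). The
// Span <= Incr case returns 1 directly, avoiding a step past Stop.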
4609 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4610 "omp_" + Name + ".tripcount");
4611}
4612
4613Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4614 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4615 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4616 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4617 ScanInfo *ScanRedInfo) {
4618 LocationDescription ComputeLoc =
4619 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4620
4621 Value *TripCount = calculateCanonicalLoopTripCount(
4622 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4623
4624 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4625 Builder.restoreIP(CodeGenIP);
4626 Value *Span = Builder.CreateMul(IV, Step);
4627 Value *IndVar = Builder.CreateAdd(Span, Start);
4628 if (InScan)
4629 ScanRedInfo->IV = IndVar;
4630 return BodyGenCB(Builder.saveIP(), IndVar);
4631 };
4632 LocationDescription LoopLoc =
4633 ComputeIP.isSet()
4634 ? Loc
4635 : LocationDescription(Builder.saveIP(),
4636 Builder.getCurrentDebugLocation());
4637 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4638}
4639
4640// Returns an LLVM function to call for initializing loop bounds using OpenMP
4641// static scheduling for composite `distribute parallel for` depending on
4642// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4643// integers as unsigned similarly to CanonicalLoopInfo.
4644static FunctionCallee
4645getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4646 OpenMPIRBuilder &OMPBuilder) {
4647 unsigned Bitwidth = Ty->getIntegerBitWidth();
4648 if (Bitwidth == 32)
4649 return OMPBuilder.getOrCreateRuntimeFunction(
4650 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4651 if (Bitwidth == 64)
4652 return OMPBuilder.getOrCreateRuntimeFunction(
4653 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4654 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4655}
4656
4657// Returns an LLVM function to call for initializing loop bounds using OpenMP
4658// static scheduling depending on `type`. Only i32 and i64 are supported by the
4659// runtime. Always interpret integers as unsigned similarly to
4660// CanonicalLoopInfo.
4661static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4662 OpenMPIRBuilder &OMPBuilder) {
4663 unsigned Bitwidth = Ty->getIntegerBitWidth();
4664 if (Bitwidth == 32)
4665 return OMPBuilder.getOrCreateRuntimeFunction(
4666 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4667 if (Bitwidth == 64)
4668 return OMPBuilder.getOrCreateRuntimeFunction(
4669 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4670 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4671}
4672
4673OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4674 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4675 WorksharingLoopType LoopType, bool NeedsBarrier) {
4676 assert(CLI->isValid() && "Requires a valid canonical loop");
4677 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4678 "Require dedicated allocate IP");
4679
4680 // Set up the source location value for OpenMP runtime.
4681 Builder.restoreIP(CLI->getPreheaderIP());
4682 Builder.SetCurrentDebugLocation(DL);
4683
4684 uint32_t SrcLocStrSize;
4685 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4686 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4687
4688 // Declare useful OpenMP runtime functions.
4689 Value *IV = CLI->getIndVar();
4690 Type *IVTy = IV->getType();
4691 FunctionCallee StaticInit =
4692 LoopType == WorksharingLoopType::DistributeForStaticLoop
4693 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4694 : getKmpcForStaticInitForType(IVTy, M, *this);
4695 FunctionCallee StaticFini =
4696 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4697
4698 // Allocate space for computed loop bounds as expected by the "init" function.
4699 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4700
4701 Type *I32Type = Type::getInt32Ty(M.getContext());
4702 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4703 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4704 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4705 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4706 CLI->setLastIter(PLastIter);
4707
4708 // At the end of the preheader, prepare for calling the "init" function by
4709 // storing the current loop bounds into the allocated space. A canonical loop
4710 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4711 // and produces an inclusive upper bound.
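// As an illustrative sketch of the runtime contract: for a trip count of
// 100 the call below passes lb=0, ub=99, stride=1, and the init function
// narrows [lb, ub] to the calling thread's slice, e.g. typically
// lb=0, ub=24 for thread 0 of 4 with the default static schedule.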
4712 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4713 Constant *Zero = ConstantInt::get(IVTy, 0);
4714 Constant *One = ConstantInt::get(IVTy, 1);
4715 Builder.CreateStore(Zero, PLowerBound);
4716 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4717 Builder.CreateStore(UpperBound, PUpperBound);
4718 Builder.CreateStore(One, PStride);
4719
4720 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4721
4722 OMPScheduleType SchedType =
4723 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4724 ? OMPScheduleType::OrderedDistribute
4725 : OMPScheduleType::UnorderedStatic;
4726 Constant *SchedulingType =
4727 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4728
4729 // Call the "init" function and update the trip count of the loop with the
4730 // value it produced.
4731 SmallVector<Value *, 10> Args(
4732 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4733 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4734 Value *PDistUpperBound =
4735 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4736 Args.push_back(PDistUpperBound);
4737 }
4738 Args.append({PStride, One, Zero});
4739 Builder.CreateCall(StaticInit, Args);
4740 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4741 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4742 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4743 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4744 CLI->setTripCount(TripCount);
4745
4746 // Update all uses of the induction variable except the one in the condition
4747 // block that compares it with the actual upper bound, and the increment in
4748 // the latch block.
4749
4750 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4751 Builder.SetInsertPoint(CLI->getBody(),
4752 CLI->getBody()->getFirstInsertionPt());
4753 Builder.SetCurrentDebugLocation(DL);
4754 return Builder.CreateAdd(OldIV, LowerBound);
4755 });
4756
4757 // In the "exit" block, call the "fini" function.
4758 Builder.SetInsertPoint(CLI->getExit(),
4759 CLI->getExit()->getTerminator()->getIterator());
4760 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4761
4762 // Add the barrier if requested.
4763 if (NeedsBarrier) {
4764 InsertPointOrErrorTy BarrierIP =
4765 createBarrier(LocationDescription(Builder.saveIP(), DL),
4766 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4767 /* CheckCancelFlag */ false);
4768 if (!BarrierIP)
4769 return BarrierIP.takeError();
4770 }
4771
4772 InsertPointTy AfterIP = CLI->getAfterIP();
4773 CLI->invalidate();
4774
4775 return AfterIP;
4776}
4777
4778OpenMPIRBuilder::InsertPointOrErrorTy
4779OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4780 CanonicalLoopInfo *CLI,
4781 InsertPointTy AllocaIP,
4782 bool NeedsBarrier,
4783 Value *ChunkSize) {
4784 assert(CLI->isValid() && "Requires a valid canonical loop");
4785 assert(ChunkSize && "Chunk size is required");
4786
4787 LLVMContext &Ctx = CLI->getFunction()->getContext();
4788 Value *IV = CLI->getIndVar();
4789 Value *OrigTripCount = CLI->getTripCount();
4790 Type *IVTy = IV->getType();
4791 assert(IVTy->getIntegerBitWidth() <= 64 &&
4792 "Max supported tripcount bitwidth is 64 bits");
4793 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4794 : Type::getInt64Ty(Ctx);
4795 Type *I32Type = Type::getInt32Ty(M.getContext());
4796 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4797 Constant *One = ConstantInt::get(InternalIVTy, 1);
4798
4799 // Declare useful OpenMP runtime functions.
4800 FunctionCallee StaticInit =
4801 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4802 FunctionCallee StaticFini =
4803 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4804
4805 // Allocate space for computed loop bounds as expected by the "init" function.
4806 Builder.restoreIP(AllocaIP);
4807 Builder.SetCurrentDebugLocation(DL);
4808 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4809 Value *PLowerBound =
4810 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4811 Value *PUpperBound =
4812 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4813 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4814 CLI->setLastIter(PLastIter);
4815
4816 // Set up the source location value for the OpenMP runtime.
4817 Builder.restoreIP(CLI->getPreheaderIP());
4818 Builder.SetCurrentDebugLocation(DL);
4819
4820 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4821 Value *CastedChunkSize =
4822 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4823 Value *CastedTripCount =
4824 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4825
4826 Constant *SchedulingType = ConstantInt::get(
4827 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4828 Builder.CreateStore(Zero, PLowerBound);
4829 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4830 Builder.CreateStore(OrigUpperBound, PUpperBound);
4831 Builder.CreateStore(One, PStride);
4832
4833 // Call the "init" function and update the trip count of the loop with the
4834 // value it produced.
4835 uint32_t SrcLocStrSize;
4836 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4837 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4838 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4839 Builder.CreateCall(StaticInit,
4840 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4841 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4842 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4843 /*pstride=*/PStride, /*incr=*/One,
4844 /*chunk=*/CastedChunkSize});
4845
4846 // Load values written by the "init" function.
4847 Value *FirstChunkStart =
4848 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4849 Value *FirstChunkStop =
4850 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4851 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4852 Value *ChunkRange =
4853 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4854 Value *NextChunkStride =
4855 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4856
4857 // Create outer "dispatch" loop for enumerating the chunks.
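// The resulting structure is roughly:
//
//   for (dispatch = firstchunk.lb; dispatch < tripcount;
//        dispatch += dispatch.stride)
//     for (iv = 0; iv < min(chunk.range, tripcount - dispatch); ++iv)
//       body(dispatch + iv);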
4858 BasicBlock *DispatchEnter = splitBB(Builder, true);
4859 Value *DispatchCounter;
4860
4861 // It is safe to assume this didn't return an error because the callback
4862 // passed into createCanonicalLoop is the only possible error source, and it
4863 // always returns success.
4864 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4865 {Builder.saveIP(), DL},
4866 [&](InsertPointTy BodyIP, Value *Counter) {
4867 DispatchCounter = Counter;
4868 return Error::success();
4869 },
4870 FirstChunkStart, CastedTripCount, NextChunkStride,
4871 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4872 "dispatch"));
4873
4874 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4875 // not have to preserve the canonical invariant.
4876 BasicBlock *DispatchBody = DispatchCLI->getBody();
4877 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4878 BasicBlock *DispatchExit = DispatchCLI->getExit();
4879 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4880 DispatchCLI->invalidate();
4881
4882 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4883 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4884 redirectTo(CLI->getExit(), DispatchLatch, DL);
4885 redirectTo(DispatchBody, DispatchEnter, DL);
4886
4887 // Prepare the prolog of the chunk loop.
4888 Builder.restoreIP(CLI->getPreheaderIP());
4889 Builder.SetCurrentDebugLocation(DL);
4890
4891 // Compute the number of iterations of the chunk loop.
4892 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4893 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4894 Value *IsLastChunk =
4895 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4896 Value *CountUntilOrigTripCount =
4897 Builder.CreateSub(CastedTripCount, DispatchCounter);
4898 Value *ChunkTripCount = Builder.CreateSelect(
4899 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4900 Value *BackcastedChunkTC =
4901 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4902 CLI->setTripCount(BackcastedChunkTC);
4903
4904 // Update all uses of the induction variable except the one in the condition
4905 // block that compares it with the actual upper bound, and the increment in
4906 // the latch block.
4907 Value *BackcastedDispatchCounter =
4908 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4909 CLI->mapIndVar([&](Instruction *) -> Value * {
4910 Builder.restoreIP(CLI->getBodyIP());
4911 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4912 });
4913
4914 // In the "exit" block, call the "fini" function.
4915 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4916 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4917
4918 // Add the barrier if requested.
4919 if (NeedsBarrier) {
4920 InsertPointOrErrorTy AfterIP =
4921 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4922 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4923 if (!AfterIP)
4924 return AfterIP.takeError();
4925 }
4926
4927#ifndef NDEBUG
4928 // Even though we currently do not support applying additional methods to it,
4929 // the chunk loop should remain a canonical loop.
4930 CLI->assertOK();
4931#endif
4932
4933 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4934}
4935
4936// Returns an LLVM function to call for executing an OpenMP static worksharing
4937// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4938// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4939static FunctionCallee
4940getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4941 WorksharingLoopType LoopType) {
4942 unsigned Bitwidth = Ty->getIntegerBitWidth();
4943 Module &M = OMPBuilder->M;
4944 switch (LoopType) {
4945 case WorksharingLoopType::ForStaticLoop:
4946 if (Bitwidth == 32)
4947 return OMPBuilder->getOrCreateRuntimeFunction(
4948 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4949 if (Bitwidth == 64)
4950 return OMPBuilder->getOrCreateRuntimeFunction(
4951 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4952 break;
4953 case WorksharingLoopType::DistributeStaticLoop:
4954 if (Bitwidth == 32)
4955 return OMPBuilder->getOrCreateRuntimeFunction(
4956 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4957 if (Bitwidth == 64)
4958 return OMPBuilder->getOrCreateRuntimeFunction(
4959 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4960 break;
4961 case WorksharingLoopType::DistributeForStaticLoop:
4962 if (Bitwidth == 32)
4963 return OMPBuilder->getOrCreateRuntimeFunction(
4964 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4965 if (Bitwidth == 64)
4966 return OMPBuilder->getOrCreateRuntimeFunction(
4967 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4968 break;
4969 }
4970 if (Bitwidth != 32 && Bitwidth != 64) {
4971 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4972 }
4973 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4974}
4975
4976 // Inserts a call to the proper OpenMP device RTL function which handles
4977// loop worksharing.
4978static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4979 WorksharingLoopType LoopType,
4980 BasicBlock *InsertBlock, Value *Ident,
4981 Value *LoopBodyArg, Value *TripCount,
4982 Function &LoopBodyFn, bool NoLoop) {
4983 Type *TripCountTy = TripCount->getType();
4984 Module &M = OMPBuilder->M;
4985 IRBuilder<> &Builder = OMPBuilder->Builder;
4986 FunctionCallee RTLFn =
4987 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4988 SmallVector<Value *, 8> RealArgs;
4989 RealArgs.push_back(Ident);
4990 RealArgs.push_back(&LoopBodyFn);
4991 RealArgs.push_back(LoopBodyArg);
4992 RealArgs.push_back(TripCount);
4993 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4994 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4995 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4996 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4997 Builder.CreateCall(RTLFn, RealArgs);
4998 return;
4999 }
5000 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5001 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5002 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5003 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5004
5005 RealArgs.push_back(
5006 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5007 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5008 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5009 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5010 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5011 } else {
5012 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5013 }
5014
5015 Builder.CreateCall(RTLFn, RealArgs);
5016}
5017
5018static void workshareLoopTargetCallback(
5019 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5020 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5021 WorksharingLoopType LoopType, bool NoLoop) {
5022 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5023 BasicBlock *Preheader = CLI->getPreheader();
5024 Value *TripCount = CLI->getTripCount();
5025
5026 // After loop body outlining, the loop body contains only the setup of the
5027 // loop body argument structure and the call to the outlined
5028 // loop body function. First, we need to move the setup of the loop body args
5029 // into the loop preheader.
5030 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5031 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5032
5033 // The next step is to remove the whole loop. We do not need it anymore.
5034 // That's why we make an unconditional branch from the loop preheader to the
5035 // loop exit block.
5036 Builder.restoreIP({Preheader, Preheader->end()});
5037 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5038 Preheader->getTerminator()->eraseFromParent();
5039 Builder.CreateBr(CLI->getExit());
5040
5041 // Delete dead loop blocks
5042 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5043 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5044 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5045 CleanUpInfo.EntryBB = CLI->getHeader();
5046 CleanUpInfo.ExitBB = CLI->getExit();
5047 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5048 DeleteDeadBlocks(BlocksToBeRemoved);
5049
5050 // Find the value which corresponds to the loop body argument structure
5051 // and remove the call instruction to the outlined loop body function.
5052 Value *LoopBodyArg;
5053 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5054 assert(OutlinedFnUser &&
5055 "Expected unique undroppable user of outlined function");
5056 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5057 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5058 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5059 "Expected outlined function call to be located in loop preheader");
5060 // Check in case no argument structure has been passed.
5061 if (OutlinedFnCallInstruction->arg_size() > 1)
5062 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5063 else
5064 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5065 OutlinedFnCallInstruction->eraseFromParent();
5066
5067 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5068 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5069
5070 for (auto &ToBeDeletedItem : ToBeDeleted)
5071 ToBeDeletedItem->eraseFromParent();
5072 CLI->invalidate();
5073}
5074
5075OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5076 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5077 WorksharingLoopType LoopType, bool NoLoop) {
5078 uint32_t SrcLocStrSize;
5079 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5080 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5081
5082 OutlineInfo OI;
5083 OI.OuterAllocaBB = CLI->getPreheader();
5084 Function *OuterFn = CLI->getPreheader()->getParent();
5085
5086 // Instructions which need to be deleted at the end of code generation
5087 SmallVector<Instruction *, 4> ToBeDeleted;
5088
5089 OI.OuterAllocaBB = AllocaIP.getBlock();
5090
5091 // Mark the body loop as region which needs to be extracted
5092 OI.EntryBB = CLI->getBody();
5093 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5094 "omp.prelatch", true);
5095
5096 // Prepare loop body for extraction
5097 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5098
5099 // Insert a new loop counter variable which will be used only in the loop
5100 // body.
5101 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5102 Instruction *NewLoopCntLoad =
5103 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5104 // The new loop counter instructions are redundant in the loop preheader when
5105 // code generation for the workshare loop is finished. That's why we mark them
5106 // as ready for deletion.
5107 ToBeDeleted.push_back(NewLoopCntLoad);
5108 ToBeDeleted.push_back(NewLoopCnt);
5109
5110 // Analyse loop body region. Find all input variables which are used inside
5111 // loop body region.
5112 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5113 SmallVector<BasicBlock *, 32> Blocks;
5114 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5115
5116 CodeExtractorAnalysisCache CEAC(*OuterFn);
5117 CodeExtractor Extractor(Blocks,
5118 /* DominatorTree */ nullptr,
5119 /* AggregateArgs */ true,
5120 /* BlockFrequencyInfo */ nullptr,
5121 /* BranchProbabilityInfo */ nullptr,
5122 /* AssumptionCache */ nullptr,
5123 /* AllowVarArgs */ true,
5124 /* AllowAlloca */ true,
5125 /* AllocationBlock */ CLI->getPreheader(),
5126 /* Suffix */ ".omp_wsloop",
5127 /* AggrArgsIn0AddrSpace */ true);
5128
5129 BasicBlock *CommonExit = nullptr;
5130 SetVector<Value *> SinkingCands, HoistingCands;
5131
5132 // Find allocas outside the loop body region which are used inside loop
5133 // body
5134 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5135
5136 // We need to model the loop body region as the function f(cnt, loop_arg).
5137 // That's why we replace the loop induction variable with the new counter,
5138 // which will be one of the loop body function's arguments.
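// Conceptually the outlined region becomes
//   void body_fn(IVTy cnt, void *loop_arg);
// which createTargetLoopWorkshareCall above hands to the device RTL,
// roughly (for the i32 worksharing case):
//   __kmpc_for_static_loop_4u(ident, body_fn, loop_arg, tripcount,
//                             num_threads, 0, 0);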
5139 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5140 CLI->getIndVar()->user_end());
5141 for (auto Use : Users) {
5142 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5143 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5144 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5145 }
5146 }
5147 }
5148 // Make sure that the loop counter variable is not merged into the loop body
5149 // function argument structure and is passed as a separate variable.
5150 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5151
5152 // The PostOutline CB is invoked when the loop body function has been
5153 // outlined and the loop body replaced by a call to it. We need to add a
5154 // call to the OpenMP device RTL inside the loop preheader. The OpenMP
5155 // device RTL function will handle the loop control logic.
5156 //
5157 OI.PostOutlineCB = [=, ToBeDeletedVec =
5158 std::move(ToBeDeleted)](Function &OutlinedFn) {
5159 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5160 LoopType, NoLoop);
5161 };
5162 addOutlineInfo(std::move(OI));
5163 return CLI->getAfterIP();
5164}
5165
5166OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5167 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5168 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5169 bool HasSimdModifier, bool HasMonotonicModifier,
5170 bool HasNonmonotonicModifier, bool HasOrderedClause,
5171 WorksharingLoopType LoopType, bool NoLoop) {
5172 if (Config.isTargetDevice())
5173 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5174 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5175 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5176 HasNonmonotonicModifier, HasOrderedClause);
5177
5178 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5179 OMPScheduleType::ModifierOrdered;
5180 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5181 case OMPScheduleType::BaseStatic:
5182 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5183 if (IsOrdered)
5184 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5185 NeedsBarrier, ChunkSize);
5186 // FIXME: Monotonicity ignored?
5187 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5188
5189 case OMPScheduleType::BaseStaticChunked:
5190 if (IsOrdered)
5191 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5192 NeedsBarrier, ChunkSize);
5193 // FIXME: Monotonicity ignored?
5194 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5195 ChunkSize);
5196
5197 case OMPScheduleType::BaseRuntime:
5198 case OMPScheduleType::BaseAuto:
5199 case OMPScheduleType::BaseGreedy:
5200 case OMPScheduleType::BaseBalanced:
5201 case OMPScheduleType::BaseSteal:
5202 case OMPScheduleType::BaseGuidedSimd:
5203 case OMPScheduleType::BaseRuntimeSimd:
5204 assert(!ChunkSize &&
5205 "schedule type does not support user-defined chunk sizes");
5206 [[fallthrough]];
5207 case OMPScheduleType::BaseDynamicChunked:
5208 case OMPScheduleType::BaseGuidedChunked:
5209 case OMPScheduleType::BaseGuidedIterativeChunked:
5210 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5211 case OMPScheduleType::BaseStaticBalancedChunked:
5212 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5213 NeedsBarrier, ChunkSize);
5214
5215 default:
5216 llvm_unreachable("Unknown/unimplemented schedule kind");
5217 }
5218}
5219
5220/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5221/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5222/// the runtime. Always interpret integers as unsigned similarly to
5223/// CanonicalLoopInfo.
5224static FunctionCallee
5225getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5226 unsigned Bitwidth = Ty->getIntegerBitWidth();
5227 if (Bitwidth == 32)
5228 return OMPBuilder.getOrCreateRuntimeFunction(
5229 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5230 if (Bitwidth == 64)
5231 return OMPBuilder.getOrCreateRuntimeFunction(
5232 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5233 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5234}
5235
5236/// Returns an LLVM function to call for updating the next loop using OpenMP
5237/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5238/// the runtime. Always interpret integers as unsigned similarly to
5239/// CanonicalLoopInfo.
5240static FunctionCallee
5241getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5242 unsigned Bitwidth = Ty->getIntegerBitWidth();
5243 if (Bitwidth == 32)
5244 return OMPBuilder.getOrCreateRuntimeFunction(
5245 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5246 if (Bitwidth == 64)
5247 return OMPBuilder.getOrCreateRuntimeFunction(
5248 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5249 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5250}
5251
5252 /// Returns an LLVM function to call for finalizing the dynamic loop,
5253/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5254/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5255static FunctionCallee
5256getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5257 unsigned Bitwidth = Ty->getIntegerBitWidth();
5258 if (Bitwidth == 32)
5259 return OMPBuilder.getOrCreateRuntimeFunction(
5260 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5261 if (Bitwidth == 64)
5262 return OMPBuilder.getOrCreateRuntimeFunction(
5263 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5264 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5265}
5266
5267OpenMPIRBuilder::InsertPointOrErrorTy
5268OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5269 InsertPointTy AllocaIP,
5270 OMPScheduleType SchedType,
5271 bool NeedsBarrier, Value *Chunk) {
5272 assert(CLI->isValid() && "Requires a valid canonical loop");
5273 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5274 "Require dedicated allocate IP");
5276 "Require valid schedule type");
5277
5278 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5279 OMPScheduleType::ModifierOrdered;
5280
5281 // Set up the source location value for OpenMP runtime.
5282 Builder.SetCurrentDebugLocation(DL);
5283
5284 uint32_t SrcLocStrSize;
5285 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5286 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5287
5288 // Declare useful OpenMP runtime functions.
5289 Value *IV = CLI->getIndVar();
5290 Type *IVTy = IV->getType();
5291 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5292 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5293
5294 // Allocate space for computed loop bounds as expected by the "init" function.
5295 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5296 Type *I32Type = Type::getInt32Ty(M.getContext());
5297 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5298 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5299 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5300 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5301 CLI->setLastIter(PLastIter);
5302
5303 // At the end of the preheader, prepare for calling the "init" function by
5304 // storing the current loop bounds into the allocated space. A canonical loop
5305 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5306 // and produces an inclusive upper bound.
5307 BasicBlock *PreHeader = CLI->getPreheader();
5308 Builder.SetInsertPoint(PreHeader->getTerminator());
5309 Constant *One = ConstantInt::get(IVTy, 1);
5310 Builder.CreateStore(One, PLowerBound);
5311 Value *UpperBound = CLI->getTripCount();
5312 Builder.CreateStore(UpperBound, PUpperBound);
5313 Builder.CreateStore(One, PStride);
5314
5315 BasicBlock *Header = CLI->getHeader();
5316 BasicBlock *Exit = CLI->getExit();
5317 BasicBlock *Cond = CLI->getCond();
5318 BasicBlock *Latch = CLI->getLatch();
5319 InsertPointTy AfterIP = CLI->getAfterIP();
5320
5321 // The CLI will be "broken" in the code below, as the loop is no longer
5322 // a valid canonical loop.
5323
5324 if (!Chunk)
5325 Chunk = One;
5326
5327 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5328
5329 Constant *SchedulingType =
5330 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5331
5332 // Call the "init" function.
5333 Builder.CreateCall(DynamicInit,
5334 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5335 UpperBound, /* step */ One, Chunk});
5336
5337 // An outer loop around the existing one.
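// The rewritten loop is roughly (the runtime hands back 1-based
// inclusive [lb, ub] chunks, matching the init call above):
//
//   while (__kmpc_dispatch_next(loc, tid, &plastiter, &lb, &ub, &stride)) {
//     for (iv = lb - 1; iv < ub; ++iv)
//       body(iv);
//   }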
5338 BasicBlock *OuterCond = BasicBlock::Create(
5339 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5340 PreHeader->getParent());
5341 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5342 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5343 Value *Res =
5344 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5345 PLowerBound, PUpperBound, PStride});
5346 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5347 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5348 Value *LowerBound =
5349 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5350 Builder.CreateCondBr(MoreWork, Header, Exit);
5351
5352 // Change PHI-node in loop header to use outer cond rather than preheader,
5353 // and set IV to the LowerBound.
5354 Instruction *Phi = &Header->front();
5355 auto *PI = cast<PHINode>(Phi);
5356 PI->setIncomingBlock(0, OuterCond);
5357 PI->setIncomingValue(0, LowerBound);
5358
5359 // Then set the pre-header to jump to the OuterCond
5360 Instruction *Term = PreHeader->getTerminator();
5361 auto *Br = cast<BranchInst>(Term);
5362 Br->setSuccessor(0, OuterCond);
5363
5364 // Modify the inner condition:
5365 // * Use the UpperBound returned from the DynamicNext call.
5366 // * Jump to the outer loop when done with one of the inner loops.
5367 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5368 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5369 Instruction *Comp = &*Builder.GetInsertPoint();
5370 auto *CI = cast<CmpInst>(Comp);
5371 CI->setOperand(1, UpperBound);
5372 // Redirect the inner exit to branch to outer condition.
5373 Instruction *Branch = &Cond->back();
5374 auto *BI = cast<BranchInst>(Branch);
5375 assert(BI->getSuccessor(1) == Exit);
5376 BI->setSuccessor(1, OuterCond);
5377
5378 // Call the "fini" function if "ordered" is present in wsloop directive.
5379 if (Ordered) {
5380 Builder.SetInsertPoint(&Latch->back());
5381 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5382 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5383 }
5384
5385 // Add the barrier if requested.
5386 if (NeedsBarrier) {
5387 Builder.SetInsertPoint(&Exit->back());
5388 InsertPointOrErrorTy BarrierIP =
5389 createBarrier(LocationDescription(Builder.saveIP(), DL),
5390 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5391 /* CheckCancelFlag */ false);
5392 if (!BarrierIP)
5393 return BarrierIP.takeError();
5394 }
5395
5396 CLI->invalidate();
5397 return AfterIP;
5398}
5399
5400/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5401/// after this \p OldTarget will be orphaned.
5402static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5403 BasicBlock *NewTarget, DebugLoc DL) {
5404 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5405 redirectTo(Pred, NewTarget, DL);
5406}
5407
5408/// Determine which blocks in \p BBs are reachable from outside and remove the
5409/// ones that are not reachable from the function.
5410static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5411 SmallPtrSet<BasicBlock *, 6> BBsToErase(BBs.begin(), BBs.end());
5412 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5413 for (Use &U : BB->uses()) {
5414 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5415 if (!UseInst)
5416 continue;
5417 if (BBsToErase.count(UseInst->getParent()))
5418 continue;
5419 return true;
5420 }
5421 return false;
5422 };
5423
5424 while (BBsToErase.remove_if(HasRemainingUses)) {
5425 // Try again if anything was removed.
5426 }
5427
5428 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5429 DeleteDeadBlocks(BBVec);
5430}
5431
5432CanonicalLoopInfo *
5433OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5434 InsertPointTy ComputeIP) {
5435 assert(Loops.size() >= 1 && "At least one loop required");
5436 size_t NumLoops = Loops.size();
5437
5438 // Nothing to do if there is already just one loop.
5439 if (NumLoops == 1)
5440 return Loops.front();
5441
5442 CanonicalLoopInfo *Outermost = Loops.front();
5443 CanonicalLoopInfo *Innermost = Loops.back();
5444 BasicBlock *OrigPreheader = Outermost->getPreheader();
5445 BasicBlock *OrigAfter = Outermost->getAfter();
5446 Function *F = OrigPreheader->getParent();
5447
5448 // Loop control blocks that may become orphaned later.
5449 SmallVector<BasicBlock *, 12> OldControlBBs;
5450 OldControlBBs.reserve(6 * Loops.size());
5451 for (CanonicalLoopInfo *Loop : Loops)
5452 Loop->collectControlBlocks(OldControlBBs);
5453
5454 // Setup the IRBuilder for inserting the trip count computation.
5455 Builder.SetCurrentDebugLocation(DL);
5456 if (ComputeIP.isSet())
5457 Builder.restoreIP(ComputeIP);
5458 else
5459 Builder.restoreIP(Outermost->getPreheaderIP());
5460
5461 // Derive the collapsed loop's trip count.
5462 // TODO: Find common/largest indvar type.
5463 Value *CollapsedTripCount = nullptr;
5464 for (CanonicalLoopInfo *L : Loops) {
5465 assert(L->isValid() &&
5466 "All loops to collapse must be valid canonical loops");
5467 Value *OrigTripCount = L->getTripCount();
5468 if (!CollapsedTripCount) {
5469 CollapsedTripCount = OrigTripCount;
5470 continue;
5471 }
5472
5473 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5474 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5475 }
5476
5477 // Create the collapsed loop control flow.
5478 CanonicalLoopInfo *Result =
5479 createLoopSkeleton(DL, CollapsedTripCount, F,
5480 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5481
5482 // Build the collapsed loop body code.
5483 // Start with deriving the input loop induction variables from the collapsed
5484 // one, using a divmod scheme. To preserve the original loops' order, the
5485 // innermost loop uses the least significant bits.
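// For illustration, collapsing two loops with trip counts TC0 (outer)
// and TC1 (inner) yields a single loop of TC0*TC1 iterations with
//   iv1 = iv % TC1;  iv0 = iv / TC1;
// so consecutive collapsed iterations advance the innermost induction
// variable first, preserving the original execution order.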
5486 Builder.restoreIP(Result->getBodyIP());
5487
5488 Value *Leftover = Result->getIndVar();
5489 SmallVector<Value *> NewIndVars;
5490 NewIndVars.resize(NumLoops);
5491 for (int i = NumLoops - 1; i >= 1; --i) {
5492 Value *OrigTripCount = Loops[i]->getTripCount();
5493
5494 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5495 NewIndVars[i] = NewIndVar;
5496
5497 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5498 }
5499 // Outermost loop gets all the remaining bits.
5500 NewIndVars[0] = Leftover;
5501
5502 // Construct the loop body control flow.
5503 // We progressively construct the branch structure following the direction of
5504 // the control flow: from the leading in-between code, through the loop nest
5505 // body and the trailing in-between code, rejoining the collapsed loop's latch.
5506 // ContinueBlock and ContinuePred keep track of the source(s) of the next edge.
5507 // If ContinueBlock is set, continue with that block. If ContinuePred is set,
5508 // use its predecessors as sources.
5509 BasicBlock *ContinueBlock = Result->getBody();
5510 BasicBlock *ContinuePred = nullptr;
5511 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5512 BasicBlock *NextSrc) {
5513 if (ContinueBlock)
5514 redirectTo(ContinueBlock, Dest, DL);
5515 else
5516 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5517
5518 ContinueBlock = nullptr;
5519 ContinuePred = NextSrc;
5520 };
5521
5522 // The code before the nested loop of each level.
5523 // Because we are sinking it into the nest, it will be executed more often
5524 // than the original loop. More sophisticated schemes could keep track of what
5525 // the in-between code is and instantiate it only once per thread.
5526 for (size_t i = 0; i < NumLoops - 1; ++i)
5527 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5528
5529 // Connect the loop nest body.
5530 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5531
5532 // The code after the nested loop at each level.
5533 for (size_t i = NumLoops - 1; i > 0; --i)
5534 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5535
5536 // Connect the finished loop to the collapsed loop latch.
5537 ContinueWith(Result->getLatch(), nullptr);
5538
5539 // Replace the input loops with the new collapsed loop.
5540 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5541 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5542
5543 // Replace the input loop indvars with the derived ones.
5544 for (size_t i = 0; i < NumLoops; ++i)
5545 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5546
5547 // Remove unused parts of the input loops.
5548 removeUnusedBlocksFromParent(OldControlBBs);
5549
5550 for (CanonicalLoopInfo *L : Loops)
5551 L->invalidate();
5552
5553#ifndef NDEBUG
5554 Result->assertOK();
5555#endif
5556 return Result;
5557}
5558
5559std::vector<CanonicalLoopInfo *>
5560OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5561 ArrayRef<Value *> TileSizes) {
5562 assert(TileSizes.size() == Loops.size() &&
5563 "Must pass as many tile sizes as there are loops");
5564 int NumLoops = Loops.size();
5565 assert(NumLoops >= 1 && "At least one loop to tile required");
5566
5567 CanonicalLoopInfo *OutermostLoop = Loops.front();
5568 CanonicalLoopInfo *InnermostLoop = Loops.back();
5569 Function *F = OutermostLoop->getBody()->getParent();
5570 BasicBlock *InnerEnter = InnermostLoop->getBody();
5571 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5572
5573 // Loop control blocks that may become orphaned later.
5574 SmallVector<BasicBlock *, 12> OldControlBBs;
5575 OldControlBBs.reserve(6 * Loops.size());
5576 for (CanonicalLoopInfo *Loop : Loops)
5577 Loop->collectControlBlocks(OldControlBBs);
5578
5579 // Collect original trip counts and induction variables to be accessible by
5580 // index. Also, the structure of the original loops is not preserved during
5581 // the construction of the tiled loops, so do it before we scavenge the BBs of
5582 // any original CanonicalLoopInfo.
5583 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5584 for (CanonicalLoopInfo *L : Loops) {
5585 assert(L->isValid() && "All input loops must be valid canonical loops");
5586 OrigTripCounts.push_back(L->getTripCount());
5587 OrigIndVars.push_back(L->getIndVar());
5588 }
5589
5590 // Collect the code between loop headers. These may contain SSA definitions
5591 // that are used in the loop nest body. To be usable within the innermost
5592 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5593 // these instructions may be executed more often than before the tiling.
5594 // TODO: It would be sufficient to only sink them into the body of the
5595 // corresponding tile loop.
5596 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5597 for (int i = 0; i < NumLoops - 1; ++i) {
5598 CanonicalLoopInfo *Surrounding = Loops[i];
5599 CanonicalLoopInfo *Nested = Loops[i + 1];
5600
5601 BasicBlock *EnterBB = Surrounding->getBody();
5602 BasicBlock *ExitBB = Nested->getHeader();
5603 InbetweenCode.emplace_back(EnterBB, ExitBB);
5604 }
5605
5606 // Compute the trip counts of the floor loops.
5607 Builder.SetCurrentDebugLocation(DL);
5608 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5609 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5610 for (int i = 0; i < NumLoops; ++i) {
5611 Value *TileSize = TileSizes[i];
5612 Value *OrigTripCount = OrigTripCounts[i];
5613 Type *IVType = OrigTripCount->getType();
5614
5615 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5616 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5617
5618 // 0 if the tilesize divides the tripcount, 1 otherwise.
5619 // 1 means we need an additional iteration for a partial tile.
5620 //
5621 // Unfortunately we cannot just use the roundup formula
5622 // (tripcount + tilesize - 1)/tilesize
5623 // because the summation might overflow. We do not want to introduce undefined
5624 // behavior when the untiled loop nest did not.
5625 Value *FloorTripOverflow =
5626 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5627
5628 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5629 Value *FloorTripCount =
5630 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5631 "omp_floor" + Twine(i) + ".tripcount", true);
5632
5633 // Remember some values for later use.
5634 FloorCompleteCount.push_back(FloorCompleteTripCount);
5635 FloorCount.push_back(FloorTripCount);
5636 FloorRems.push_back(FloorTripRem);
5637 }
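// Worked example (illustrative, not from the original source): with
// OrigTripCount = 10 and TileSize = 4, FloorCompleteTripCount = 10 / 4 = 2
// and FloorTripRem = 10 % 4 = 2, so FloorTripOverflow = 1 and
// FloorTripCount = 2 + 1 = 3: two complete tiles plus one partial tile,
// computed without the overflow-prone (10 + 4 - 1) / 4 round-up form.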
5638
5639 // Generate the new loop nest, from the outermost to the innermost.
5640 std::vector<CanonicalLoopInfo *> Result;
5641 Result.reserve(NumLoops * 2);
5642
5643 // The basic block of the surrounding loop that enters the newly generated
5644 // loop nest.
5645 BasicBlock *Enter = OutermostLoop->getPreheader();
5646
5647 // The basic block of the surrounding loop where the inner code should
5648 // continue.
5649 BasicBlock *Continue = OutermostLoop->getAfter();
5650
5651 // Where the next loop basic block should be inserted.
5652 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5653
5654 auto EmbeddNewLoop =
5655 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5656 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5657 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5658 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5659 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5660 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5661
5662 // Setup the position where the next embedded loop connects to this loop.
5663 Enter = EmbeddedLoop->getBody();
5664 Continue = EmbeddedLoop->getLatch();
5665 OutroInsertBefore = EmbeddedLoop->getLatch();
5666 return EmbeddedLoop;
5667 };
5668
5669 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5670 const Twine &NameBase) {
5671 for (auto P : enumerate(TripCounts)) {
5672 CanonicalLoopInfo *EmbeddedLoop =
5673 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5674 Result.push_back(EmbeddedLoop);
5675 }
5676 };
5677
5678 EmbeddNewLoops(FloorCount, "floor");
5679
5680 // Within the innermost floor loop, emit the code that computes the tile
5681 // sizes.
5682 Builder.SetInsertPoint(Enter->getTerminator());
5683 SmallVector<Value *, 4> TileCounts;
5684 for (int i = 0; i < NumLoops; ++i) {
5685 CanonicalLoopInfo *FloorLoop = Result[i];
5686 Value *TileSize = TileSizes[i];
5687
5688 Value *FloorIsEpilogue =
5689 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5690 Value *TileTripCount =
5691 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5692
5693 TileCounts.push_back(TileTripCount);
5694 }
5695
5696 // Create the tile loops.
5697 EmbeddNewLoops(TileCounts, "tile");
5698
5699 // Insert the inbetween code into the body.
5700 BasicBlock *BodyEnter = Enter;
5701 BasicBlock *BodyEntered = nullptr;
5702 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5703 BasicBlock *EnterBB = P.first;
5704 BasicBlock *ExitBB = P.second;
5705
5706 if (BodyEnter)
5707 redirectTo(BodyEnter, EnterBB, DL);
5708 else
5709 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5710
5711 BodyEnter = nullptr;
5712 BodyEntered = ExitBB;
5713 }
5714
5715 // Append the original loop nest body into the generated loop nest body.
5716 if (BodyEnter)
5717 redirectTo(BodyEnter, InnerEnter, DL);
5718 else
5719 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5720
5721
5722 // Replace the original induction variable with an induction variable computed
5723 // from the tile and floor induction variables.
5724 Builder.restoreIP(Result.back()->getBodyIP());
5725 for (int i = 0; i < NumLoops; ++i) {
5726 CanonicalLoopInfo *FloorLoop = Result[i];
5727 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5728 Value *OrigIndVar = OrigIndVars[i];
5729 Value *Size = TileSizes[i];
5730
5731 Value *Scale =
5732 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5733 Value *Shift =
5734 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5735 OrigIndVar->replaceAllUsesWith(Shift);
5736 }
5737
5738 // Remove unused parts of the original loops.
5739 removeUnusedBlocksFromParent(OldControlBBs);
5740
5741 for (CanonicalLoopInfo *L : Loops)
5742 L->invalidate();
5743
5744#ifndef NDEBUG
5745 for (CanonicalLoopInfo *GenL : Result)
5746 GenL->assertOK();
5747#endif
5748 return Result;
5749}
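// Illustrative result (sketch, not from the original source): tiling a
// single canonical loop over 0..N-1 with tile size 4 produces
//   for (floor0 = 0; floor0 < FloorTripCount; ++floor0)   // "floor" loop
//     for (tile0 = 0; tile0 < TileTripCount; ++tile0)     // "tile" loop
//       body(4 * floor0 + tile0);
// where TileTripCount is 4 for complete tiles and FloorTripRem for the
// epilogue tile, and the body's induction variable is rebuilt as
// Size * FloorIndVar + TileIndVar as emitted above.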
5750
5751/// Attach metadata \p Properties to the basic block described by \p BB. If the
5752/// basic block already has metadata, the basic block properties are appended.
5753static void addBasicBlockMetadata(BasicBlock *BB,
5754 ArrayRef<Metadata *> Properties) {
5755 // Nothing to do if no property to attach.
5756 if (Properties.empty())
5757 return;
5758
5759 LLVMContext &Ctx = BB->getContext();
5760 SmallVector<Metadata *> NewProperties;
5761 NewProperties.push_back(nullptr);
5762
5763 // If the basic block already has metadata, prepend it to the new metadata.
5764 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5765 if (Existing)
5766 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5767
5768 append_range(NewProperties, Properties);
5769 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5770 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5771
5772 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5773}
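// For reference (illustrative sketch): the resulting !llvm.loop node is
// self-referential, e.g.
//   br label %latch, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}
// which is why operand 0 is replaced with the node itself above.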
5774
5775/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5776/// loop already has metadata, the loop properties are appended.
5777static void addLoopMetadata(CanonicalLoopInfo *Loop,
5778 ArrayRef<Metadata *> Properties) {
5779 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5780
5781 // Attach metadata to the loop's latch
5782 BasicBlock *Latch = Loop->getLatch();
5783 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5784 addBasicBlockMetadata(Latch, Properties);
5785}
5786
5787/// Attach llvm.access.group metadata to the memref instructions of \p Block
5788static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5789 LoopInfo &LI) {
5790 for (Instruction &I : *Block) {
5791 if (I.mayReadOrWriteMemory()) {
5792 // TODO: This instruction may already have an access group from
5793 // other pragmas, e.g. #pragma clang loop vectorize. Append
5794 // so that the existing metadata is not overwritten.
5795 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5796 }
5797 }
5798}
5799
5800void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5801 LLVMContext &Ctx = Builder.getContext();
5802 addLoopMetadata(
5803 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5804 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5805}
5806
5807void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5808 LLVMContext &Ctx = Builder.getContext();
5809 addLoopMetadata(
5810 Loop, {
5811 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5812 });
5813}
5814
5815void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5816 Value *IfCond, ValueToValueMapTy &VMap,
5817 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5818 const Twine &NamePrefix) {
5819 Function *F = CanonicalLoop->getFunction();
5820
5821 // We can't do
5822 // if (cond) {
5823 // simd_loop;
5824 // } else {
5825 // non_simd_loop;
5826 // }
5827 // because then the CanonicalLoopInfo would only point to one of the loops,
5828 // causing other constructs that operate on the same loop to malfunction.
5829 // Instead generate
5830 // while (...) {
5831 // if (cond) {
5832 // simd_body;
5833 // } else {
5834 // not_simd_body;
5835 // }
5836 // }
5837 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5838 // body at -O3
5839
5840 // Define where the if branch should be inserted.
5841 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5842
5843 // Create additional blocks for the if statement
5844 BasicBlock *Cond = SplitBeforeIt->getParent();
5845 llvm::LLVMContext &C = Cond->getContext();
5846 llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
5847 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5848 llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
5849 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5850
5851 // Create if condition branch.
5852 Builder.SetInsertPoint(SplitBeforeIt);
5853 Instruction *BrInstr =
5854 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5855 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5856 // Then block contains branch to omp loop body which needs to be vectorized
5857 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5858 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5859
5860 Builder.SetInsertPoint(ElseBlock);
5861
5862 // Clone loop for the else branch
5863 SmallVector<BasicBlock *, 8> NewBlocks;
5864
5865 SmallVector<BasicBlock *, 8> ExistingBlocks;
5866 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5867 ExistingBlocks.push_back(ThenBlock);
5868 ExistingBlocks.append(L->block_begin(), L->block_end());
5869 // Cond is the block that has the if clause condition
5870 // LoopCond is omp_loop.cond
5871 // LoopHeader is omp_loop.header
5872 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5873 BasicBlock *LoopHeader = LoopCond ? LoopCond->getUniquePredecessor() : nullptr;
5874 assert(LoopCond && LoopHeader && "Invalid loop structure");
5875 for (BasicBlock *Block : ExistingBlocks) {
5876 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5877 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5878 continue;
5879 }
5880 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5881
5882 // fix name not to be omp.if.then
5883 if (Block == ThenBlock)
5884 NewBB->setName(NamePrefix + ".if.else");
5885
5886 NewBB->moveBefore(CanonicalLoop->getExit());
5887 VMap[Block] = NewBB;
5888 NewBlocks.push_back(NewBB);
5889 }
5890 remapInstructionsInBlocks(NewBlocks, VMap);
5891 Builder.CreateBr(NewBlocks.front());
5892
5893 // The loop latch must have only one predecessor. Currently it is branched to
5894 // from both the 'then' and 'else' branches.
5895 L->getLoopLatch()->splitBasicBlock(
5896 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5897
5898 // Ensure that the then block is added to the loop so we add the attributes in
5899 // the next step
5900 L->addBasicBlockToLoop(ThenBlock, LI);
5901}
5902
5903unsigned
5904OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5905 const StringMap<bool> &Features) {
5906 if (TargetTriple.isX86()) {
5907 if (Features.lookup("avx512f"))
5908 return 512;
5909 else if (Features.lookup("avx"))
5910 return 256;
5911 return 128;
5912 }
5913 if (TargetTriple.isPPC())
5914 return 128;
5915 if (TargetTriple.isWasm())
5916 return 128;
5917 return 0;
5918}
5919
5920void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5921 MapVector<Value *, Value *> AlignedVars,
5922 Value *IfCond, OrderKind Order,
5923 ConstantInt *Simdlen, ConstantInt *Safelen) {
5924 LLVMContext &Ctx = Builder.getContext();
5925
5926 Function *F = CanonicalLoop->getFunction();
5927
5928 // TODO: We should not rely on pass manager. Currently we use pass manager
5929 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5930 // object. We should have a method which returns all blocks between
5931 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5932 FunctionAnalysisManager FAM;
5933 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5934 FAM.registerPass([]() { return LoopAnalysis(); });
5935 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5936
5937 LoopAnalysis LIA;
5938 LoopInfo &&LI = LIA.run(*F, FAM);
5939
5940 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5941 if (AlignedVars.size()) {
5942 InsertPointTy IP = Builder.saveIP();
5943 for (auto &AlignedItem : AlignedVars) {
5944 Value *AlignedPtr = AlignedItem.first;
5945 Value *Alignment = AlignedItem.second;
5946 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5947 Builder.SetInsertPoint(loadInst->getNextNode());
5948 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5949 Alignment);
5950 }
5951 Builder.restoreIP(IP);
5952 }
5953
5954 if (IfCond) {
5955 ValueToValueMapTy VMap;
5956 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5957 }
5958
5959 SmallSetVector<BasicBlock *, 8> Reachable;
5960
5961 // Get the basic blocks from the loop in which memref instructions
5962 // can be found.
5963 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5964 // preferably without running any passes.
5965 for (BasicBlock *Block : L->getBlocks()) {
5966 if (Block == CanonicalLoop->getCond() ||
5967 Block == CanonicalLoop->getHeader())
5968 continue;
5969 Reachable.insert(Block);
5970 }
5971
5972 SmallVector<Metadata *> LoopMDList;
5973
5974 // In the presence of a finite 'safelen', it may be unsafe to mark all
5975 // the memory instructions parallel, because loop-carried
5976 // dependences of 'safelen' iterations are possible.
5977 // If the clause order(concurrent) is specified then the memory instructions
5978 // are marked parallel even if 'safelen' is finite.
5979 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5980 // Add access group metadata to memory-access instructions.
5981 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5982 for (BasicBlock *BB : Reachable)
5983 addSimdMetadata(BB, AccessGroup, LI);
5984 // TODO: If the loop has existing parallel access metadata, have
5985 // to combine two lists.
5986 LoopMDList.push_back(MDNode::get(
5987 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5988 }
5989
5990 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5991 // versions so we can't add the loop attributes in that case.
5992 if (IfCond) {
5993 // We can still add llvm.loop.parallel_accesses.
5994 addLoopMetadata(CanonicalLoop, LoopMDList);
5995 return;
5996 }
5997
5998 // Use the above access group metadata to create loop level
5999 // metadata, which should be distinct for each loop.
6000 ConstantAsMetadata *BoolConst =
6001 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6002 LoopMDList.push_back(MDNode::get(
6003 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6004
6005 if (Simdlen || Safelen) {
6006 // If both simdlen and safelen clauses are specified, the value of the
6007 // simdlen parameter must be less than or equal to the value of the safelen
6008 // parameter. Therefore, use safelen only in the absence of simdlen.
6009 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6010 LoopMDList.push_back(
6011 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6012 ConstantAsMetadata::get(VectorizeWidth)}));
6013 }
6014
6015 addLoopMetadata(CanonicalLoop, LoopMDList);
6016}
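// Illustrative sketch (not from the original source) of the metadata emitted
// for, e.g., 'simd simdlen(8)' without an if clause:
//   !0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !AG},
//                   !{!"llvm.loop.vectorize.enable", i1 true},
//                   !{!"llvm.loop.vectorize.width", i32 8}}
// where !AG is the distinct access group attached to the loop's memory
// instructions above.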
6017
6018/// Create the TargetMachine object to query the backend for optimization
6019/// preferences.
6020///
6021/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6022/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6023/// needed for the LLVM pass pipeline. We use some default options to avoid
6024/// having to pass too many settings from the frontend that probably do not
6025/// matter.
6026///
6027/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6028/// method. If we are going to use TargetMachine for more purposes, especially
6029/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6030/// might become worth requiring front-ends to pass on their TargetMachine,
6031/// or at least cache it between methods. Note that while frontends such as Clang
6032/// have just a single main TargetMachine per translation unit, "target-cpu" and
6033/// "target-features" that determine the TargetMachine are per-function and can
6034/// be overridden using __attribute__((target("OPTIONS"))).
6035static std::unique_ptr<TargetMachine>
6036createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6037 Module *M = F->getParent();
6038
6039 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6040 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6041 const llvm::Triple &Triple = M->getTargetTriple();
6042
6043 std::string Error;
6044 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6045 if (!TheTarget)
6046 return {};
6047
6048 llvm::TargetOptions Options;
6049 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6050 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6051 /*CodeModel=*/std::nullopt, OptLevel));
6052}
6053
6054/// Heuristically determine the best-performing unroll factor for \p CLI. This
6055/// depends on the target processor. We are re-using the same heuristics as the
6056/// LoopUnrollPass.
6057static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6058 Function *F = CLI->getFunction();
6059
6060 // Assume the user requests the most aggressive unrolling, even if the rest of
6061 // the code is optimized using a lower setting.
6062 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6063 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6064
6065 FunctionAnalysisManager FAM;
6066 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6067 FAM.registerPass([]() { return AssumptionAnalysis(); });
6068 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6069 FAM.registerPass([]() { return LoopAnalysis(); });
6070 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6071 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6072 TargetIRAnalysis TIRA;
6073 if (TM)
6074 TIRA = TargetIRAnalysis(
6075 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6076 FAM.registerPass([&]() { return TIRA; });
6077
6078 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6079 ScalarEvolutionAnalysis SEA;
6080 ScalarEvolution &&SE = SEA.run(*F, FAM);
6081 DominatorTreeAnalysis DTA;
6082 DominatorTree &&DT = DTA.run(*F, FAM);
6083 LoopAnalysis LIA;
6084 LoopInfo &&LI = LIA.run(*F, FAM);
6085 AssumptionAnalysis ACT;
6086 AssumptionCache &&AC = ACT.run(*F, FAM);
6087 OptimizationRemarkEmitter ORE{F};
6088
6089 Loop *L = LI.getLoopFor(CLI->getHeader());
6090 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6091
6092 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6091
6093 L, SE, TTI,
6094 /*BlockFrequencyInfo=*/nullptr,
6095 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6096 /*UserThreshold=*/std::nullopt,
6097 /*UserCount=*/std::nullopt,
6098 /*UserAllowPartial=*/true,
6099 /*UserAllowRuntime=*/true,
6100 /*UserUpperBound=*/std::nullopt,
6101 /*UserFullUnrollMaxCount=*/std::nullopt);
6102
6103 UP.Force = true;
6104
6105 // Account for additional optimizations taking place before the LoopUnrollPass
6106 // would unroll the loop.
6107 UP.Threshold *= UnrollThresholdFactor;
6108 UP.PartialThreshold *= UnrollThresholdFactor;
6109
6110 // Use normal unroll factors even if the rest of the code is optimized for
6111 // size.
6112 UP.OptSizeThreshold = UP.Threshold;
6113 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6114
6115 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6116 << " Threshold=" << UP.Threshold << "\n"
6117 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6118 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6119 << " PartialOptSizeThreshold="
6120 << UP.PartialOptSizeThreshold << "\n");
6121
6122 // Disable peeling.
6123 TargetTransformInfo::PeelingPreferences PP =
6124 gatherPeelingPreferences(L, SE, TTI,
6125 /*UserAllowPeeling=*/false,
6126 /*UserAllowProfileBasedPeeling=*/false,
6127 /*UnrollingSpecficValues=*/false);
6128
6129 SmallPtrSet<const Value *, 32> EphValues;
6130 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6131
6132 // Assume that reads and writes to stack variables can be eliminated by
6133 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6134 // size.
6135 for (BasicBlock *BB : L->blocks()) {
6136 for (Instruction &I : *BB) {
6137 Value *Ptr;
6138 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6139 Ptr = Load->getPointerOperand();
6140 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6141 Ptr = Store->getPointerOperand();
6142 } else
6143 continue;
6144
6145 Ptr = Ptr->stripPointerCasts();
6146
6147 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6148 if (Alloca->getParent() == &F->getEntryBlock())
6149 EphValues.insert(&I);
6150 }
6151 }
6152 }
6153
6154 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6155
6156 // Loop is not unrollable if the loop contains certain instructions.
6157 if (!UCE.canUnroll()) {
6158 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6159 return 1;
6160 }
6161
6162 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6163 << "\n");
6164
6165 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6166 // be able to use it.
6167 int TripCount = 0;
6168 int MaxTripCount = 0;
6169 bool MaxOrZero = false;
6170 unsigned TripMultiple = 0;
6171
6172 bool UseUpperBound = false;
6173 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6174 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6175 UseUpperBound);
6176 unsigned Factor = UP.Count;
6177 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6178
6179 // This function returns 1 to signal that the loop should not be unrolled.
6180 if (Factor == 0)
6181 return 1;
6182 return Factor;
6183}
6184
6185void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6186 int32_t Factor,
6187 CanonicalLoopInfo **UnrolledCLI) {
6188 assert(Factor >= 0 && "Unroll factor must not be negative");
6189
6190 Function *F = Loop->getFunction();
6191 LLVMContext &Ctx = F->getContext();
6192
6193 // If the unrolled loop is not used for another loop-associated directive, it
6194 // is sufficient to add metadata for the LoopUnrollPass.
6195 if (!UnrolledCLI) {
6196 SmallVector<Metadata *, 2> LoopMetadata;
6197 LoopMetadata.push_back(
6198 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6199
6200 if (Factor >= 1) {
6201 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6202 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6203 LoopMetadata.push_back(MDNode::get(
6204 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6205 }
6206
6207 addLoopMetadata(Loop, LoopMetadata);
6208 return;
6209 }
6210
6211 // Heuristically determine the unroll factor.
6212 if (Factor == 0)
6213 Factor = computeHeuristicUnrollFactor(Loop);
6214
6215 // No change required with unroll factor 1.
6216 if (Factor == 1) {
6217 *UnrolledCLI = Loop;
6218 return;
6219 }
6220
6221 assert(Factor >= 2 &&
6222 "unrolling only makes sense with a factor of 2 or larger");
6223
6224 Type *IndVarTy = Loop->getIndVarType();
6225
6226 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6227 // unroll the inner loop.
6228 Value *FactorVal =
6229 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6230 /*isSigned=*/false));
6231 std::vector<CanonicalLoopInfo *> LoopNest =
6232 tileLoops(DL, {Loop}, {FactorVal});
6233 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6234 *UnrolledCLI = LoopNest[0];
6235 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6236
6237 // LoopUnrollPass can only fully unroll loops with constant trip count.
6238 // Unroll by the unroll factor with a fallback epilog for the remainder
6239 // iterations if necessary.
6240 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6241 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6242 addLoopMetadata(
6243 InnerLoop,
6244 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6245 MDNode::get(
6246 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6247
6248#ifndef NDEBUG
6249 (*UnrolledCLI)->assertOK();
6250#endif
6251}
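// Illustrative note (not from the original source): with Factor = 4 and a
// non-null UnrolledCLI, the loop is first tiled into a floor/tile pair, and
// the inner tile loop is tagged with llvm.loop.unroll.enable plus
// llvm.loop.unroll.count 4 so the LoopUnrollPass can unroll it later, with
// an epilogue covering the partial tile; *UnrolledCLI is the outer floor
// loop.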
6252
6253OpenMPIRBuilder::InsertPointTy
6254OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6255 llvm::Value *BufSize, llvm::Value *CpyBuf,
6256 llvm::Value *CpyFn, llvm::Value *DidIt) {
6257 if (!updateToLocation(Loc))
6258 return Loc.IP;
6259
6260 uint32_t SrcLocStrSize;
6261 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6262 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6263 Value *ThreadId = getOrCreateThreadID(Ident);
6264
6265 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6266
6267 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6268
6269 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6270 Builder.CreateCall(Fn, Args);
6271
6272 return Builder.saveIP();
6273}
6274
6275OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6276 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6277 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6278 ArrayRef<llvm::Function *> CPFuncs) {
6279
6280 if (!updateToLocation(Loc))
6281 return Loc.IP;
6282
6283 // If needed, allocate and initialize `DidIt` with 0.
6284 // DidIt: flag variable: 1=single thread; 0=not single thread.
6285 llvm::Value *DidIt = nullptr;
6286 if (!CPVars.empty()) {
6287 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6288 Builder.CreateStore(Builder.getInt32(0), DidIt);
6289 }
6290
6291 Directive OMPD = Directive::OMPD_single;
6292 uint32_t SrcLocStrSize;
6293 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6294 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6295 Value *ThreadId = getOrCreateThreadID(Ident);
6296 Value *Args[] = {Ident, ThreadId};
6297
6298 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6299 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6300
6301 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6302 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6303
6304 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6305 if (Error Err = FiniCB(IP))
6306 return Err;
6307
6308 // The thread that executes the single region must set `DidIt` to 1.
6309 // This is used by __kmpc_copyprivate, to know if the caller is the
6310 // single thread or not.
6311 if (DidIt)
6312 Builder.CreateStore(Builder.getInt32(1), DidIt);
6313
6314 return Error::success();
6315 };
6316
6317 // generates the following:
6318 // if (__kmpc_single()) {
6319 // .... single region ...
6320 // __kmpc_end_single
6321 // }
6322 // __kmpc_copyprivate
6323 // __kmpc_barrier
6324
6325 InsertPointOrErrorTy AfterIP =
6326 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6327 /*Conditional*/ true,
6328 /*hasFinalize*/ true);
6329 if (!AfterIP)
6330 return AfterIP.takeError();
6331
6332 if (DidIt) {
6333 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6334 // NOTE BufSize is currently unused, so just pass 0.
6335 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6336 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6337 CPFuncs[I], DidIt);
6338 // NOTE __kmpc_copyprivate already inserts a barrier
6339 } else if (!IsNowait) {
6340 InsertPointOrErrorTy AfterIP =
6341 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6342 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6343 /* CheckCancelFlag */ false);
6344 if (!AfterIP)
6345 return AfterIP.takeError();
6346 }
6347 return Builder.saveIP();
6348}
6349
6350OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6351 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6352 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6353
6354 if (!updateToLocation(Loc))
6355 return Loc.IP;
6356
6357 Directive OMPD = Directive::OMPD_critical;
6358 uint32_t SrcLocStrSize;
6359 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6360 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6361 Value *ThreadId = getOrCreateThreadID(Ident);
6362 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6363 Value *Args[] = {Ident, ThreadId, LockVar};
6364
6365 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6366 Function *RTFn = nullptr;
6367 if (HintInst) {
6368 // Add Hint to entry Args and create call
6369 EnterArgs.push_back(HintInst);
6370 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6371 } else {
6372 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6373 }
6374 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6375
6376 Function *ExitRTLFn =
6377 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6378 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6379
6380 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6381 /*Conditional*/ false, /*hasFinalize*/ true);
6382}
6383
6384OpenMPIRBuilder::InsertPointTy
6385OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6386 InsertPointTy AllocaIP, unsigned NumLoops,
6387 ArrayRef<llvm::Value *> StoreValues,
6388 const Twine &Name, bool IsDependSource) {
6389 assert(
6390 llvm::all_of(StoreValues,
6391 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6392 "OpenMP runtime requires depend vec with i64 type");
6393
6394 if (!updateToLocation(Loc))
6395 return Loc.IP;
6396
6397 // Allocate space for vector and generate alloc instruction.
6398 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6399 Builder.restoreIP(AllocaIP);
6400 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6401 ArgsBase->setAlignment(Align(8));
6402 updateToLocation(Loc);
6403
6404 // Store the index value with offset in depend vector.
6405 for (unsigned I = 0; I < NumLoops; ++I) {
6406 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6407 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6408 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6409 STInst->setAlignment(Align(8));
6410 }
6411
6412 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6413 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6414
6415 uint32_t SrcLocStrSize;
6416 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6417 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6418 Value *ThreadId = getOrCreateThreadID(Ident);
6419 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6420
6421 Function *RTLFn = nullptr;
6422 if (IsDependSource)
6423 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6424 else
6425 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6426 Builder.CreateCall(RTLFn, Args);
6427
6428 return Builder.saveIP();
6429}
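// Usage sketch (illustrative, not from the original source):
// '#pragma omp ordered depend(source)' reaches this with
// IsDependSource = true and lowers to __kmpc_doacross_post(&loc, tid, &vec),
// while 'depend(sink: ...)' uses IsDependSource = false and lowers to
// __kmpc_doacross_wait, with the iteration vector stored as i64 values in
// the alloca built above.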
6430
6431OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6432 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6433 FinalizeCallbackTy FiniCB, bool IsThreads) {
6434 if (!updateToLocation(Loc))
6435 return Loc.IP;
6436
6437 Directive OMPD = Directive::OMPD_ordered;
6438 Instruction *EntryCall = nullptr;
6439 Instruction *ExitCall = nullptr;
6440
6441 if (IsThreads) {
6442 uint32_t SrcLocStrSize;
6443 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6444 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6445 Value *ThreadId = getOrCreateThreadID(Ident);
6446 Value *Args[] = {Ident, ThreadId};
6447
6448 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6449 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6450
6451 Function *ExitRTLFn =
6452 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6453 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6454 }
6455
6456 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6457 /*Conditional*/ false, /*hasFinalize*/ true);
6458}
6459
6460OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6461 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6462 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6463 bool HasFinalize, bool IsCancellable) {
6464
6465 if (HasFinalize)
6466 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6467
6468 // Create inlined region's entry and body blocks, in preparation
6469 // for conditional creation
6470 BasicBlock *EntryBB = Builder.GetInsertBlock();
6471 Instruction *SplitPos = EntryBB->getTerminator();
6472 if (!isa_and_nonnull<BranchInst>(SplitPos))
6473 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6474 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6475 BasicBlock *FiniBB =
6476 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6477
6478 Builder.SetInsertPoint(EntryBB->getTerminator());
6479 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6480
6481 // generate body
6482 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6483 /* CodeGenIP */ Builder.saveIP()))
6484 return Err;
6485
6486 // emit exit call and do any needed finalization.
6487 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6488 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6489 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6490 "Unexpected control flow graph state!!");
6491 InsertPointOrErrorTy AfterIP =
6492 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6493 if (!AfterIP)
6494 return AfterIP.takeError();
6495 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6496 "Unexpected Control Flow State!");
6498
6499 // If we are skipping the region of a non-conditional, remove the exit
6500 // block, and clear the builder's insertion point.
6501 assert(SplitPos->getParent() == ExitBB &&
6502 "Unexpected Insertion point location!");
6503 auto merged = MergeBlockIntoPredecessor(ExitBB);
6504 BasicBlock *ExitPredBB = SplitPos->getParent();
6505 auto InsertBB = merged ? ExitPredBB : ExitBB;
6506 if (!isa_and_nonnull<BranchInst>(SplitPos))
6507 SplitPos->eraseFromParent();
6508 Builder.SetInsertPoint(InsertBB);
6509
6510 return Builder.saveIP();
6511}
6512
6513OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6514 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6515 // If there is nothing to do, return the current insertion point.
6516 if (!Conditional || !EntryCall)
6517 return Builder.saveIP();
6518
6519 BasicBlock *EntryBB = Builder.GetInsertBlock();
6520 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6521 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6522 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6523
6524 // Emit thenBB and set the Builder's insertion point there for
6525 // body generation next. Place the block after the current block.
6526 Function *CurFn = EntryBB->getParent();
6527 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6528
6529 // Move Entry branch to end of ThenBB, and replace with conditional
6530 // branch (If-stmt)
6531 Instruction *EntryBBTI = EntryBB->getTerminator();
6532 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6533 EntryBBTI->removeFromParent();
6534 Builder.SetInsertPoint(UI);
6535 Builder.Insert(EntryBBTI);
6536 UI->eraseFromParent();
6537 Builder.SetInsertPoint(ThenBB->getTerminator());
6538
6539 // return an insertion point to ExitBB.
6540 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6541}
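// Illustrative CFG for a conditional entry such as 'single' (sketch only):
//   entry:  %res  = call i32 @__kmpc_single(...)
//           %cond = icmp ne i32 %res, 0
//           br i1 %cond, label %omp_region.body, label %omp_region.end
// Only the thread for which the entry call returns non-zero executes the
// region body.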
6542
6543OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6544 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6545 bool HasFinalize) {
6546
6547 Builder.restoreIP(FinIP);
6548
6549 // If there is finalization to do, emit it before the exit call
6550 if (HasFinalize) {
6551 assert(!FinalizationStack.empty() &&
6552 "Unexpected finalization stack state!");
6553
6554 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6555 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6556
6557 if (Error Err = Fi.FiniCB(FinIP))
6558 return Err;
6559
6560 BasicBlock *FiniBB = FinIP.getBlock();
6561 Instruction *FiniBBTI = FiniBB->getTerminator();
6562
6563 // set Builder IP for call creation
6564 Builder.SetInsertPoint(FiniBBTI);
6565 }
6566
6567 if (!ExitCall)
6568 return Builder.saveIP();
6569
6570 // Place the exit call as the last instruction before the finalization block terminator.
6571 ExitCall->removeFromParent();
6572 Builder.Insert(ExitCall);
6573
6574 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6575 ExitCall->getIterator());
6576}
6577
6578OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6579 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6580 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6581 if (!IP.isSet())
6582 return IP;
6583
6584 IRBuilder<>::InsertPointGuard IPG(Builder);
6585
6586 // creates the following CFG structure
6587 // OMP_Entry : (MasterAddr != PrivateAddr)?
6588 // F T
6589 // | \
6590 // | copyin.not.master
6591 // | /
6592 // v /
6593 // copyin.not.master.end
6594 // |
6595 // v
6596 // OMP.Entry.Next
6597
6598 BasicBlock *OMP_Entry = IP.getBlock();
6599 Function *CurFn = OMP_Entry->getParent();
6600 BasicBlock *CopyBegin =
6601 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6602 BasicBlock *CopyEnd = nullptr;
6603
6604 // If entry block is terminated, split to preserve the branch to following
6605 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6606 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6607 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6608 "copyin.not.master.end");
6609 OMP_Entry->getTerminator()->eraseFromParent();
6610 } else {
6611 CopyEnd =
6612 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6613 }
6614
6615 Builder.SetInsertPoint(OMP_Entry);
6616 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6617 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6618 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6619 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6620
6621 Builder.SetInsertPoint(CopyBegin);
6622 if (BranchtoEnd)
6623 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6624
6625 return Builder.saveIP();
6626}
6627
6628CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6629 Value *Size, Value *Allocator,
6630 std::string Name) {
6631 IRBuilder<>::InsertPointGuard IPG(Builder);
6632 updateToLocation(Loc);
6633
6634 uint32_t SrcLocStrSize;
6635 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6636 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6637 Value *ThreadId = getOrCreateThreadID(Ident);
6638 Value *Args[] = {ThreadId, Size, Allocator};
6639
6640 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6641
6642 return Builder.CreateCall(Fn, Args, Name);
6643}
6644
6645CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6646 Value *Addr, Value *Allocator,
6647 std::string Name) {
6648 IRBuilder<>::InsertPointGuard IPG(Builder);
6649 updateToLocation(Loc);
6650
6651 uint32_t SrcLocStrSize;
6652 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6653 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6654 Value *ThreadId = getOrCreateThreadID(Ident);
6655 Value *Args[] = {ThreadId, Addr, Allocator};
6656 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6657 return Builder.CreateCall(Fn, Args, Name);
6658}
6659
6660CallInst *OpenMPIRBuilder::createOMPInteropInit(
6661 const LocationDescription &Loc, Value *InteropVar,
6662 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6663 Value *DependenceAddress, bool HaveNowaitClause) {
6664 IRBuilder<>::InsertPointGuard IPG(Builder);
6665 updateToLocation(Loc);
6666
6667 uint32_t SrcLocStrSize;
6668 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6669 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6670 Value *ThreadId = getOrCreateThreadID(Ident);
6671 if (Device == nullptr)
6672 Device = ConstantInt::get(Int32, -1);
6673 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6674 if (NumDependences == nullptr) {
6675 NumDependences = ConstantInt::get(Int32, 0);
6676 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6677 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6678 }
6679 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6680 Value *Args[] = {
6681 Ident, ThreadId, InteropVar, InteropTypeVal,
6682 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6683
6684 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6685
6686 return Builder.CreateCall(Fn, Args);
6687}
6688
6689CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6690 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6691 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6692 IRBuilder<>::InsertPointGuard IPG(Builder);
6693 updateToLocation(Loc);
6694
6695 uint32_t SrcLocStrSize;
6696 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6697 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6698 Value *ThreadId = getOrCreateThreadID(Ident);
6699 if (Device == nullptr)
6700 Device = ConstantInt::get(Int32, -1);
6701 if (NumDependences == nullptr) {
6702 NumDependences = ConstantInt::get(Int32, 0);
6703 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6704 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6705 }
6706 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6707 Value *Args[] = {
6708 Ident, ThreadId, InteropVar, Device,
6709 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6710
6711 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6712
6713 return Builder.CreateCall(Fn, Args);
6714}
6715
6716CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6717 Value *InteropVar, Value *Device,
6718 Value *NumDependences,
6719 Value *DependenceAddress,
6720 bool HaveNowaitClause) {
6721 IRBuilder<>::InsertPointGuard IPG(Builder);
6722 updateToLocation(Loc);
6723 uint32_t SrcLocStrSize;
6724 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6725 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6726 Value *ThreadId = getOrCreateThreadID(Ident);
6727 if (Device == nullptr)
6728 Device = ConstantInt::get(Int32, -1);
6729 if (NumDependences == nullptr) {
6730 NumDependences = ConstantInt::get(Int32, 0);
6731 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6732 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6733 }
6734 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6735 Value *Args[] = {
6736 Ident, ThreadId, InteropVar, Device,
6737 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6738
6739 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6740
6741 return Builder.CreateCall(Fn, Args);
6742}
6743
6744CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6745 const LocationDescription &Loc, llvm::Value *Pointer,
6746 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6747 IRBuilder<>::InsertPointGuard IPG(Builder);
6748 updateToLocation(Loc);
6749
6750 uint32_t SrcLocStrSize;
6751 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6752 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6753 Value *ThreadId = getOrCreateThreadID(Ident);
6754 Constant *ThreadPrivateCache =
6755 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6756 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6757
6758 Function *Fn =
6759 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6760
6761 return Builder.CreateCall(Fn, Args);
6762}
6763
6764OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6765 const LocationDescription &Loc,
6766 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6767 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6768 "expected num_threads and num_teams to be specified");
6769
6770 if (!updateToLocation(Loc))
6771 return Loc.IP;
6772
6773 uint32_t SrcLocStrSize;
6774 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6775 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6776 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6777 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6778 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6779 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6780 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6781
6782 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6783 Function *Kernel = DebugKernelWrapper;
6784
6785 // We need to strip the debug prefix to get the correct kernel name.
6786 StringRef KernelName = Kernel->getName();
6787 const std::string DebugPrefix = "_debug__";
6788 if (KernelName.ends_with(DebugPrefix)) {
6789 KernelName = KernelName.drop_back(DebugPrefix.length());
6790 Kernel = M.getFunction(KernelName);
6791 assert(Kernel && "Expected the real kernel to exist");
6792 }
6793
6794 // Manifest the launch configuration in the metadata matching the kernel
6795 // environment.
6796 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6797 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6798
6799 // If MaxThreads not set, select the maximum between the default workgroup
6800 // size and the MinThreads value.
6801 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6802 if (MaxThreadsVal < 0)
6803 MaxThreadsVal = std::max(
6804 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6805
6806 if (MaxThreadsVal > 0)
6807 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6808
6809 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6810 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6811 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6812 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6813 Constant *ReductionDataSize =
6814 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6815 Constant *ReductionBufferLength =
6816 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6817
6818 Function *Fn = getOrCreateRuntimeFunctionPtr(
6819 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6820 const DataLayout &DL = Fn->getDataLayout();
6821
6822 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6823 Constant *DynamicEnvironmentInitializer =
6824 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6825 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6826 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6827 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6828 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6829 DL.getDefaultGlobalsAddressSpace());
6830 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6831
6832 Constant *DynamicEnvironment =
6833 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6834 ? DynamicEnvironmentGV
6835 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6836 DynamicEnvironmentPtr);
6837
6838 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6839 ConfigurationEnvironment, {
6840 UseGenericStateMachineVal,
6841 MayUseNestedParallelismVal,
6842 IsSPMDVal,
6843 MinThreads,
6844 MaxThreads,
6845 MinTeams,
6846 MaxTeams,
6847 ReductionDataSize,
6848 ReductionBufferLength,
6849 });
6850 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6851 KernelEnvironment, {
6852 ConfigurationEnvironmentInitializer,
6853 Ident,
6854 DynamicEnvironment,
6855 });
6856 std::string KernelEnvironmentName =
6857 (KernelName + "_kernel_environment").str();
6858 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6859 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6860 KernelEnvironmentInitializer, KernelEnvironmentName,
6861 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6862 DL.getDefaultGlobalsAddressSpace());
6863 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6864
6865 Constant *KernelEnvironment =
6866 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6867 ? KernelEnvironmentGV
6868 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6869 KernelEnvironmentPtr);
6870 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6871 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6872 KernelLaunchEnvironment =
6873 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6874 ? KernelLaunchEnvironment
6875 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6876 KernelLaunchEnvParamTy);
6877 CallInst *ThreadKind =
6878 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6879
6880 Value *ExecUserCode = Builder.CreateICmpEQ(
6881 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6882 "exec_user_code");
6883
6884 // ThreadKind = __kmpc_target_init(...)
6885 // if (ThreadKind == -1)
6886 // user_code
6887 // else
6888 // return;
6889
6890 auto *UI = Builder.CreateUnreachable();
6891 BasicBlock *CheckBB = UI->getParent();
6892 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6893
6894 BasicBlock *WorkerExitBB = BasicBlock::Create(
6895 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6896 Builder.SetInsertPoint(WorkerExitBB);
6897 Builder.CreateRetVoid();
6898
6899 auto *CheckBBTI = CheckBB->getTerminator();
6900 Builder.SetInsertPoint(CheckBBTI);
6901 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6902
6903 CheckBBTI->eraseFromParent();
6904 UI->eraseFromParent();
6905
6906 // Continue in the "user_code" block, see diagram above and in
6907 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6908 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6909}
6910
6911void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6912 int32_t TeamsReductionDataSize,
6913 int32_t TeamsReductionBufferLength) {
6914 if (!updateToLocation(Loc))
6915 return;
6916
6917 Function *Fn = getOrCreateRuntimeFunctionPtr(
6918 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6919
6920 Builder.CreateCall(Fn, {});
6921
6922 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6923 return;
6924
6925 Function *Kernel = Builder.GetInsertBlock()->getParent();
6926 // We need to strip the debug prefix to get the correct kernel name.
6927 StringRef KernelName = Kernel->getName();
6928 const std::string DebugPrefix = "_debug__";
6929 if (KernelName.ends_with(DebugPrefix))
6930 KernelName = KernelName.drop_back(DebugPrefix.length());
6931 auto *KernelEnvironmentGV =
6932 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6933 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6934 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6935 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6936 KernelEnvironmentInitializer,
6937 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6938 NewInitializer = ConstantFoldInsertValueInstruction(
6939 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6940 {0, 8});
6941 KernelEnvironmentGV->setInitializer(NewInitializer);
6942}
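// For reference (illustrative note): the {0, 7} and {0, 8} indices above
// select field 0 of the kernel environment (the configuration environment)
// and then its fields 7 (ReductionDataSize) and 8 (ReductionBufferLength),
// matching the member order used when the initializer is built in
// createTargetInit.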
6943
6944static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6945 bool Min) {
6946 if (Kernel.hasFnAttribute(Name)) {
6947 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6948 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6949 }
6950 Kernel.addFnAttr(Name, llvm::utostr(Value));
6951}
6952
6953std::pair<int32_t, int32_t>
6954OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6955 int32_t ThreadLimit =
6956 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6957
6958 if (T.isAMDGPU()) {
6959 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6960 if (!Attr.isValid() || !Attr.isStringAttribute())
6961 return {0, ThreadLimit};
6962 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6963 int32_t LB, UB;
6964 if (!llvm::to_integer(UBStr, UB, 10))
6965 return {0, ThreadLimit};
6966 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6967 if (!llvm::to_integer(LBStr, LB, 10))
6968 return {0, UB};
6969 return {LB, UB};
6970 }
6971
6972 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6973 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6974 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6975 }
6976 return {0, ThreadLimit};
6977}
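// Example (illustrative, not from the original source): on AMDGPU, the
// attribute "amdgpu-flat-work-group-size"="1,256" combined with an
// omp_target_thread_limit of 128 yields {LB = 1, UB = min(128, 256) = 128};
// on NVPTX only an upper bound is tracked, via "nvvm.maxntid".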
6978
6979void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6980 Function &Kernel, int32_t LB,
6981 int32_t UB) {
6982 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6983
6984 if (T.isAMDGPU()) {
6985 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6986 llvm::utostr(LB) + "," + llvm::utostr(UB));
6987 return;
6988 }
6989
6990 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6991}
6992
6993std::pair<int32_t, int32_t>
6994OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6995 // TODO: Read from backend annotations if available.
6996 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6997}
6998
6999void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7000 int32_t LB, int32_t UB) {
7001 if (T.isNVPTX())
7002 if (UB > 0)
7003 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7004 if (T.isAMDGPU())
7005 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7006
7007 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7008}
7009
7010void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7011 Function *OutlinedFn) {
7012 if (Config.isTargetDevice()) {
7013 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7014 // TODO: Determine if DSO local can be set to true.
7015 OutlinedFn->setDSOLocal(false);
7016
7017 if (T.isAMDGCN())
7018 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7019 else if (T.isNVPTX())
7020 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7021 else if (T.isSPIRV())
7022 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7023 }
7024}
7025
7026Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7027 StringRef EntryFnIDName) {
7028 if (Config.isTargetDevice()) {
7029 assert(OutlinedFn && "The outlined function must exist if embedded");
7030 return OutlinedFn;
7031 }
7032
7033 return new GlobalVariable(
7034 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7035 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7036}
7037
7038Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7039 StringRef EntryFnName) {
7040 if (OutlinedFn)
7041 return OutlinedFn;
7042
7043 assert(!M.getGlobalVariable(EntryFnName, true) &&
7044 "Named kernel already exists?");
7045 return new GlobalVariable(
7046 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7047 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7048}
7049
7050Error OpenMPIRBuilder::emitTargetRegionFunction(
7051 TargetRegionEntryInfo &EntryInfo,
7052 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7053 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7054
7055 SmallString<64> EntryFnName;
7056 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7057
7058 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7059 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7060 if (!CBResult)
7061 return CBResult.takeError();
7062 OutlinedFn = *CBResult;
7063 } else {
7064 OutlinedFn = nullptr;
7065 }
7066
7067 // If this target outlined function is not an offload entry, we don't need
7068 // to register it. This may be the case with a false if clause, or if there
7069 // are no OpenMP targets.
7070 if (!IsOffloadEntry)
7071 return Error::success();
7072
7073 std::string EntryFnIDName =
7074 Config.isTargetDevice()
7075 ? std::string(EntryFnName)
7076 : createPlatformSpecificName({EntryFnName, "region_id"});
7077
7078 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7079 EntryFnName, EntryFnIDName);
7080 return Error::success();
7081}
7082
7083Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7084 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7085 StringRef EntryFnName, StringRef EntryFnIDName) {
7086 if (OutlinedFn)
7087 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7088 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7089 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7090 OffloadInfoManager.registerTargetRegionEntryInfo(
7091 EntryInfo, EntryAddr, OutlinedFnID,
7092 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7093 return OutlinedFnID;
7094}
7095
7096OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7097 const LocationDescription &Loc, InsertPointTy AllocaIP,
7098 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7099 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7100 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7101 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7102 BodyGenTy BodyGenType)>
7103 BodyGenCB,
7104 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7105 if (!updateToLocation(Loc))
7106 return InsertPointTy();
7107
7108 Builder.restoreIP(CodeGenIP);
7109 // Disable TargetData CodeGen on Device pass.
7110 if (Config.IsTargetDevice.value_or(false)) {
7111 if (BodyGenCB) {
7112 InsertPointOrErrorTy AfterIP =
7113 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7114 if (!AfterIP)
7115 return AfterIP.takeError();
7116 Builder.restoreIP(*AfterIP);
7117 }
7118 return Builder.saveIP();
7119 }
7120
7121 bool IsStandAlone = !BodyGenCB;
7122 MapInfosTy *MapInfo;
7123 // Generate the code for the opening of the data environment. Capture all the
7124 // arguments of the runtime call by reference because they are used in the
7125 // closing of the region.
7126 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7127 InsertPointTy CodeGenIP) -> Error {
7128 MapInfo = &GenMapInfoCB(Builder.saveIP());
7129 if (Error Err = emitOffloadingArrays(
7130 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7131 /*IsNonContiguous=*/true, DeviceAddrCB))
7132 return Err;
7133
7134 TargetDataRTArgs RTArgs;
7135 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7136
7137 // Emit the number of elements in the offloading arrays.
7138 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7139
7140 // Source location for the ident struct
7141 if (!SrcLocInfo) {
7142 uint32_t SrcLocStrSize;
7143 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7144 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7145 }
7146
7147 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7148 SrcLocInfo, DeviceID,
7149 PointerNum, RTArgs.BasePointersArray,
7150 RTArgs.PointersArray, RTArgs.SizesArray,
7151 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7152 RTArgs.MappersArray};
7153
7154 if (IsStandAlone) {
7155 assert(MapperFunc && "MapperFunc missing for standalone target data");
7156
7157 auto TaskBodyCB = [&](Value *, Value *,
7158 IRBuilderBase::InsertPoint) -> Error {
7159 if (Info.HasNoWait) {
7160 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7161 llvm::Constant::getNullValue(VoidPtr),
7162 llvm::Constant::getNullValue(Int32),
7163 llvm::Constant::getNullValue(VoidPtr)});
7164 }
7165
7166 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7167 OffloadingArgs);
7168
7169 if (Info.HasNoWait) {
7170 BasicBlock *OffloadContBlock =
7171 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7172 Function *CurFn = Builder.GetInsertBlock()->getParent();
7173 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7174 Builder.restoreIP(Builder.saveIP());
7175 }
7176 return Error::success();
7177 };
7178
7179 bool RequiresOuterTargetTask = Info.HasNoWait;
7180 if (!RequiresOuterTargetTask)
7181 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7182 /*TargetTaskAllocaIP=*/{}));
7183 else
7184 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7185 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7186 } else {
7187 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7188 omp::OMPRTL___tgt_target_data_begin_mapper);
7189
7190 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7191
7192 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7193 if (isa<AllocaInst>(DeviceMap.second.second)) {
7194 auto *LI =
7195 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7196 Builder.CreateStore(LI, DeviceMap.second.second);
7197 }
7198 }
7199
7200 // If device pointer privatization is required, emit the body of the
7201 // region here. It will have to be duplicated: with and without
7202 // privatization.
7203 InsertPointOrErrorTy AfterIP =
7204 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7205 if (!AfterIP)
7206 return AfterIP.takeError();
7207 Builder.restoreIP(*AfterIP);
7208 }
7209 return Error::success();
7210 };
7211
7212 // If we need device pointer privatization, we need to emit the body of the
7213 // region with no privatization in the 'else' branch of the conditional.
7214 // Otherwise, we don't have to do anything.
7215 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7216 InsertPointTy CodeGenIP) -> Error {
7217 InsertPointOrErrorTy AfterIP =
7218 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7219 if (!AfterIP)
7220 return AfterIP.takeError();
7221 Builder.restoreIP(*AfterIP);
7222 return Error::success();
7223 };
7224
7225 // Generate code for the closing of the data region.
7226 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7227 TargetDataRTArgs RTArgs;
7228 Info.EmitDebug = !MapInfo->Names.empty();
7229 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7230
7231 // Emit the number of elements in the offloading arrays.
7232 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7233
7234 // Source location for the ident struct
7235 if (!SrcLocInfo) {
7236 uint32_t SrcLocStrSize;
7237 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7238 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7239 }
7240
7241 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7242 PointerNum, RTArgs.BasePointersArray,
7243 RTArgs.PointersArray, RTArgs.SizesArray,
7244 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7245 RTArgs.MappersArray};
7246 Function *EndMapperFunc =
7247 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7248
7249 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7250 return Error::success();
7251 };
7252
7253 // We don't have to do anything to close the region if the if clause evaluates
7254 // to false.
7255 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7256 return Error::success();
7257 };
7258
7259 Error Err = [&]() -> Error {
7260 if (BodyGenCB) {
7261 Error Err = [&]() {
7262 if (IfCond)
7263 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7264 return BeginThenGen(AllocaIP, Builder.saveIP());
7265 }();
7266
7267 if (Err)
7268 return Err;
7269
7270 // If we don't require privatization of device pointers, we emit the body
7271 // in between the runtime calls. This avoids duplicating the body code.
7272 InsertPointOrErrorTy AfterIP =
7273 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7274 if (!AfterIP)
7275 return AfterIP.takeError();
7276 restoreIPandDebugLoc(Builder, *AfterIP);
7277
7278 if (IfCond)
7279 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7280 return EndThenGen(AllocaIP, Builder.saveIP());
7281 }
7282 if (IfCond)
7283 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7284 return BeginThenGen(AllocaIP, Builder.saveIP());
7285 }();
7286
7287 if (Err)
7288 return Err;
7289
7290 return Builder.saveIP();
7291}
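// A rough sketch of what the code above emits for a non-standalone
// `omp target data` region without an `if` clause (value names are
// illustrative):
//   call void @__tgt_target_data_begin_mapper(ptr %ident, i64 %device_id,
//       i32 %num_ptrs, ptr %baseptrs, ptr %ptrs, ptr %sizes,
//       ptr %maptypes, ptr %mapnames, ptr %mappers)
//   ; ... body emitted via BodyGenCB ...
//   call void @__tgt_target_data_end_mapper(<same argument shape>)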
7292
7293FunctionCallee
7294OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7295 bool IsGPUDistribute) {
7296 assert((IVSize == 32 || IVSize == 64) &&
7297 "IV size is not compatible with the omp runtime");
7298 RuntimeFunction Name;
7299 if (IsGPUDistribute)
7300 Name = IVSize == 32
7301 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7302 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7303 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7304 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7305 else
7306 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7307 : omp::OMPRTL___kmpc_for_static_init_4u)
7308 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7309 : omp::OMPRTL___kmpc_for_static_init_8u);
7310
7311 return getOrCreateRuntimeFunction(M, Name);
7312}
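// For example, following the selection logic above:
//   createForStaticInitFunction(32, /*IVSigned=*/true, /*IsGPUDistribute=*/false)
// resolves to __kmpc_for_static_init_4, while
//   createForStaticInitFunction(64, /*IVSigned=*/false, /*IsGPUDistribute=*/true)
// resolves to __kmpc_distribute_static_init_8u.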
7313
7314FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7315 bool IVSigned) {
7316 assert((IVSize == 32 || IVSize == 64) &&
7317 "IV size is not compatible with the omp runtime");
7318 RuntimeFunction Name = IVSize == 32
7319 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7320 : omp::OMPRTL___kmpc_dispatch_init_4u)
7321 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7322 : omp::OMPRTL___kmpc_dispatch_init_8u);
7323
7324 return getOrCreateRuntimeFunction(M, Name);
7325}
7326
7327FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7328 bool IVSigned) {
7329 assert((IVSize == 32 || IVSize == 64) &&
7330 "IV size is not compatible with the omp runtime");
7331 RuntimeFunction Name = IVSize == 32
7332 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7333 : omp::OMPRTL___kmpc_dispatch_next_4u)
7334 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7335 : omp::OMPRTL___kmpc_dispatch_next_8u);
7336
7337 return getOrCreateRuntimeFunction(M, Name);
7338}
7339
7340FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7341 bool IVSigned) {
7342 assert((IVSize == 32 || IVSize == 64) &&
7343 "IV size is not compatible with the omp runtime");
7344 RuntimeFunction Name = IVSize == 32
7345 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7346 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7347 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7348 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7349
7350 return getOrCreateRuntimeFunction(M, Name);
7351}
7352
7353FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7354 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7355}
7356
7357static void FixupDebugInfoForOutlinedFunction(
7358 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7359 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7360
7361 DISubprogram *NewSP = Func->getSubprogram();
7362 if (!NewSP)
7363 return;
7364
7365 DenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7366
7367 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7368 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7369 // Only use the cached variable if the arg number matches. This is important
7370 // so that DIVariables created for privatized variables are not discarded.
7371 if (NewVar && (arg == NewVar->getArg()))
7372 return NewVar;
7373
7374 NewVar = DILocalVariable::get(
7375 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7376 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7377 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7378 return NewVar;
7379 };
7380
7381 auto UpdateDebugRecord = [&](auto *DR) {
7382 DILocalVariable *OldVar = DR->getVariable();
7383 unsigned ArgNo = 0;
7384 for (auto Loc : DR->location_ops()) {
7385 auto Iter = ValueReplacementMap.find(Loc);
7386 if (Iter != ValueReplacementMap.end()) {
7387 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7388 ArgNo = std::get<1>(Iter->second) + 1;
7389 }
7390 }
7391 if (ArgNo != 0)
7392 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7393 };
7394
7395 // The location and scope of variable intrinsics and records still point to
7396 // the parent function of the target region. Update them.
7397 for (Instruction &I : instructions(Func)) {
7399 "Unexpected debug intrinsic");
7400 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7401 UpdateDebugRecord(&DVR);
7402 }
7403 // An extra argument is passed to the device. Create the debug data for it.
7404 if (OMPBuilder.Config.isTargetDevice()) {
7405 DICompileUnit *CU = NewSP->getUnit();
7406 Module *M = Func->getParent();
7407 DIBuilder DB(*M, true, CU);
7408 DIType *VoidPtrTy =
7409 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7410 DILocalVariable *Var = DB.createParameterVariable(
7411 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7412 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7413 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7414 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7415 &(*Func->begin()));
7416 }
7417}
7418
7419static Value *removeASCastIfPresent(Value *V) {
7420 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7421 return cast<Operator>(V)->getOperand(0);
7422 return V;
7423}
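// Example: given IR such as
//   %cast = addrspacecast ptr addrspace(5) %x to ptr
// this helper returns %x; any value that is not an addrspacecast is
// returned unchanged.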
7424
7425static Expected<Function *> createOutlinedFunction(
7426 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7427 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7428 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7429 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7430 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7431 SmallVector<Type *> ParameterTypes;
7432 if (OMPBuilder.Config.isTargetDevice()) {
7433 // Add the "implicit" runtime argument we use to provide launch specific
7434 // information for target devices.
7435 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7436 ParameterTypes.push_back(Int8PtrTy);
7437
7438 // All parameters to target devices are passed as pointers
7439 // or i64. This assumes 64-bit address spaces/pointers.
7440 for (auto &Arg : Inputs)
7441 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7442 ? Arg->getType()
7443 : Type::getInt64Ty(Builder.getContext()));
7444 } else {
7445 for (auto &Arg : Inputs)
7446 ParameterTypes.push_back(Arg->getType());
7447 }
7448
7449 auto BB = Builder.GetInsertBlock();
7450 auto M = BB->getModule();
7451 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7452 /*isVarArg*/ false);
7453 auto Func =
7454 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7455
7456 // Forward target-cpu and target-features function attributes from the
7457 // original function to the new outlined function.
7458 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7459
7460 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7461 if (TargetCpuAttr.isStringAttribute())
7462 Func->addFnAttr(TargetCpuAttr);
7463
7464 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7465 if (TargetFeaturesAttr.isStringAttribute())
7466 Func->addFnAttr(TargetFeaturesAttr);
7467
7468 if (OMPBuilder.Config.isTargetDevice()) {
7469 Value *ExecMode =
7470 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7471 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7472 }
7473
7474 // Save insert point.
7475 IRBuilder<>::InsertPointGuard IPG(Builder);
7476 // We will generate the entries in the outlined function but the debug
7477 // location may still be pointing to the parent function. Reset it now.
7478 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7479
7480 // Generate the region into the function.
7481 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7482 Builder.SetInsertPoint(EntryBB);
7483
7484 // Insert target init call in the device compilation pass.
7485 if (OMPBuilder.Config.isTargetDevice())
7486 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7487
7488 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7489
7490 // As we embed the user code in the middle of our target region after we
7491 // generate entry code, we must move what allocas we can into the entry
7492 // block to avoid possibly breaking optimisations for the device.
7493 if (OMPBuilder.Config.isTargetDevice())
7494 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7495
7496 // Insert target deinit call in the device compilation pass.
7497 BasicBlock *OutlinedBodyBB =
7498 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7499 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7500 Builder.saveIP(),
7501 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7502 if (!AfterIP)
7503 return AfterIP.takeError();
7504 Builder.restoreIP(*AfterIP);
7505 if (OMPBuilder.Config.isTargetDevice())
7506 OMPBuilder.createTargetDeinit(Builder);
7507
7508 // Insert return instruction.
7509 Builder.CreateRetVoid();
7510
7511 // New Alloca IP at entry point of created device function.
7512 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7513 auto AllocaIP = Builder.saveIP();
7514
7515 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7516
7517 // Skip the artificial dyn_ptr on the device.
7518 const auto &ArgRange =
7519 OMPBuilder.Config.isTargetDevice()
7520 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7521 : Func->args();
7522
7523 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7524
7525 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7526 // Things like GEPs can come in the form of Constants. Constants and
7527 // ConstantExprs do not know what they are contained in, so we must dig
7528 // a little to find an instruction so we can tell if they're used inside
7529 // of the function we're outlining. We also replace the original constant
7530 // expression with a new instruction equivalent: an instruction allows
7531 // easy modification in the following loop, as we now know the
7532 // (former constant, now instruction) is owned by our target function and
7533 // replaceUsesOfWith can be invoked on it (this cannot be done with
7534 // constants). A brand new one also allows us to be cautious, as it is
7535 // perhaps possible the old expression was used inside of the function
7536 // but also exists and is used externally (unlikely by the nature of a
7537 // Constant, but still possible).
7538 // NOTE: We cannot remove dead constants that have been rewritten to
7539 // instructions at this stage; we run the risk of breaking later lowering
7540 // by doing so, as we could still be in the process of lowering the module
7541 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7542 // constants we have created rewritten versions of.
7543 if (auto *Const = dyn_cast<Constant>(Input))
7544 convertUsersOfConstantsToInstructions(Const, Func, false);
7545
7546 // Collect users before iterating over them to avoid invalidating the
7547 // iteration in case a user uses Input more than once (e.g. a call
7548 // instruction).
7549 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7550 // Collect all the instructions
7551 for (User *User : make_early_inc_range(Users))
7552 if (auto *Instr = dyn_cast<Instruction>(User))
7553 if (Instr->getFunction() == Func)
7554 Instr->replaceUsesOfWith(Input, InputCopy);
7555 };
7556
7557 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7558
7559 // Rewrite uses of input values to parameters.
7560 for (auto InArg : zip(Inputs, ArgRange)) {
7561 Value *Input = std::get<0>(InArg);
7562 Argument &Arg = std::get<1>(InArg);
7563 Value *InputCopy = nullptr;
7564
7565 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7566 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7567 if (!AfterIP)
7568 return AfterIP.takeError();
7569 Builder.restoreIP(*AfterIP);
7570 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7571
7572 // In certain cases a Global may be set up for replacement; however, this
7573 // Global may be used in multiple arguments to the kernel, just segmented
7574 // apart. For example, if we have a global array that is sectioned into
7575 // multiple mappings (technically not legal in OpenMP, but there is a case
7576 // in Fortran for Common Blocks where this is necessary), we will end up
7577 // with GEPs into this array inside the kernel that refer to the Global
7578 // but are technically separate arguments to the kernel for all intents and
7579 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7580 // index, it will fold into a reference to the Global; if we then encounter
7581 // this folded GEP during replacement, all of the references to the
7582 // Global in the kernel will be replaced with the argument we have generated
7583 // that corresponds to it, including any other GEPs that refer to the
7584 // Global and that may be other arguments. This would invalidate all of the
7585 // other preceding mapped arguments that refer to the same global but are
7586 // separate segments. To prevent this, we defer global processing until all
7587 // other processing has been performed.
7588 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
7589 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg))) {
7590 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7591 continue;
7592 }
7593
7594 if (isa<ConstantData>(Input))
7595 continue;
7596
7597 ReplaceValue(Input, InputCopy, Func);
7598 }
7599
7600 // Replace all of our deferred Input values, currently just Globals.
7601 for (auto Deferred : DeferredReplacement)
7602 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7603
7604 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7605 ValueReplacementMap);
7606 return Func;
7607}
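// As a sketch (names are illustrative): on the device, inputs {ptr %a, i32 %n}
// produce an outlined kernel whose scalar parameters are widened to i64 and
// which takes the implicit launch argument first:
//   define internal void @<EntryFnName>(ptr %dyn_ptr, ptr %a, i64 %n)
// whereas on the host the original parameter types are kept.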
7608/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7609/// of pointers containing shared data between the parent task and the created
7610/// task.
7611static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7612 IRBuilderBase &Builder,
7613 Value *TaskWithPrivates,
7614 Type *TaskWithPrivatesTy) {
7615
7616 Type *TaskTy = OMPIRBuilder.Task;
7617 LLVMContext &Ctx = Builder.getContext();
7618 Value *TaskT =
7619 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7620 Value *Shareds = TaskT;
7621 // TaskWithPrivatesTy can be one of the following
7622 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7623 // %struct.privates }
7624 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7625 //
7626 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7627 // its first member has to be the task descriptor. TaskTy is the type of the
7628 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7629 // first member of TaskT, gives us the pointer to shared data.
7630 if (TaskWithPrivatesTy != TaskTy)
7631 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7632 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7633}
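// In IR terms, for the wrapped case (TaskWithPrivatesTy != TaskTy) this
// helper emits roughly (value names are illustrative):
//   %task = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %gep = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task, i32 0, i32 0
//   %shareds = load ptr, ptr %gep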
7634/// Create an entry point for a target task. It'll have the following
7635/// signature:
7636/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7637/// This function is called from emitTargetTask once the
7638/// code to launch the target kernel has been outlined already.
7639/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7640/// into the task structure so that the deferred target task can access this
7641/// data even after the stack frame of the generating task has been rolled
7642 /// back. Offloading arrays contain base pointers, pointers, sizes, etc.,
7643/// of the data that the target kernel will access. These in effect are the
7644/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7645static Function *emitTargetTaskProxyFunction(
7646 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7647 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7648 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7649
7650 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7651 // This is because PrivatesTy is the type of the structure in which
7652 // we pass the offloading arrays to the deferred target task.
7653 assert((!NumOffloadingArrays || PrivatesTy) &&
7654 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7655 "to privatize");
7656
7657 Module &M = OMPBuilder.M;
7658 // KernelLaunchFunction is the target launch function, i.e.
7659 // the function that sets up kernel arguments and calls
7660 // __tgt_target_kernel to launch the kernel on the device.
7661 //
7662 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7663
7664 // StaleCI is the CallInst which is the call to the outlined
7665 // target kernel launch function. If there are local live-in values
7666 // that the outlined function uses, then these are aggregated into a structure
7667 // which is passed as the second argument. If there are no local live-in
7668 // values or if all values used by the outlined kernel are global variables,
7669 // then there's only one argument, the threadID. So, StaleCI can be
7670 //
7671 // %structArg = alloca { ptr, ptr }, align 8
7672 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7673 // store ptr %20, ptr %gep_, align 8
7674 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7675 // store ptr %21, ptr %gep_8, align 8
7676 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7677 //
7678 // OR
7679 //
7680 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7681 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7682 StaleCI->getIterator());
7683
7684 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7685
7686 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7687 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7688 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7689
7690 auto ProxyFnTy =
7691 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7692 /* isVarArg */ false);
7693 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7694 ".omp_target_task_proxy_func",
7695 Builder.GetInsertBlock()->getModule());
7696 Value *ThreadId = ProxyFn->getArg(0);
7697 Value *TaskWithPrivates = ProxyFn->getArg(1);
7698 ThreadId->setName("thread.id");
7699 TaskWithPrivates->setName("task");
7700
7701 bool HasShareds = SharedArgsOperandNo > 0;
7702 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7703 BasicBlock *EntryBB =
7704 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7705 Builder.SetInsertPoint(EntryBB);
7706
7707 SmallVector<Value *> KernelLaunchArgs;
7708 KernelLaunchArgs.reserve(StaleCI->arg_size());
7709 KernelLaunchArgs.push_back(ThreadId);
7710
7711 if (HasOffloadingArrays) {
7712 assert(TaskTy != TaskWithPrivatesTy &&
7713 "If there are offloading arrays to pass to the target"
7714 "TaskTy cannot be the same as TaskWithPrivatesTy");
7715 (void)TaskTy;
7716 Value *Privates =
7717 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7718 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7719 KernelLaunchArgs.push_back(
7720 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7721 }
7722
7723 if (HasShareds) {
7724 auto *ArgStructAlloca =
7725 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7726 assert(ArgStructAlloca &&
7727 "Unable to find the alloca instruction corresponding to arguments "
7728 "for extracted function");
7729 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7730
7731 AllocaInst *NewArgStructAlloca =
7732 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7733
7734 Value *SharedsSize =
7735 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7736
7737 LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
7738 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7739
7740 Builder.CreateMemCpy(
7741 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7742 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7743 KernelLaunchArgs.push_back(NewArgStructAlloca);
7744 }
7745 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7746 Builder.CreateRetVoid();
7747 return ProxyFn;
7748}
7749static Type *getOffloadingArrayType(Value *V) {
7750
7751 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7752 return GEP->getSourceElementType();
7753 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7754 return Alloca->getAllocatedType();
7755
7756 llvm_unreachable("Unhandled Instruction type");
7757 return nullptr;
7758}
7759// This function returns a struct that has at most two members.
7760// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7761// descriptor. The second member, if needed, is a struct containing arrays
7762// that need to be passed to the offloaded target kernel. For example,
7763// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7764// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7765// respectively, then the types created by this function are
7766//
7767// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7768// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7769// %struct.privates }
7770// %struct.task_with_privates is returned by this function.
7771// If there aren't any offloading arrays to pass to the target kernel,
7772// %struct.kmp_task_ompbuilder_t is returned.
7773static StructType *
7774createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7775 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7776
7777 if (OffloadingArraysToPrivatize.empty())
7778 return OMPIRBuilder.Task;
7779
7780 SmallVector<Type *, 4> StructFieldTypes;
7781 for (Value *V : OffloadingArraysToPrivatize) {
7782 assert(V->getType()->isPointerTy() &&
7783 "Expected pointer to array to privatize. Got a non-pointer value "
7784 "instead");
7785 Type *ArrayTy = getOffloadingArrayType(V);
7786 assert(ArrayTy && "ArrayType cannot be nullptr");
7787 StructFieldTypes.push_back(ArrayTy);
7788 }
7789 StructType *PrivatesStructTy =
7790 StructType::create(StructFieldTypes, "struct.privates");
7791 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7792 "struct.task_with_privates");
7793}
7794static Error emitTargetOutlinedFunction(
7795 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7796 TargetRegionEntryInfo &EntryInfo,
7797 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7798 Function *&OutlinedFn, Constant *&OutlinedFnID,
7799 SmallVectorImpl<Value *> &Inputs,
7800 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7801 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7802
7803 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7804 [&](StringRef EntryFnName) {
7805 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7806 EntryFnName, Inputs, CBFunc,
7807 ArgAccessorFuncCB);
7808 };
7809
7810 return OMPBuilder.emitTargetRegionFunction(
7811 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7812 OutlinedFnID);
7813}
7814
7815OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7816 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7817 OpenMPIRBuilder::InsertPointTy AllocaIP,
7818 const SmallVector<DependData> &Dependencies,
7819 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7820
7821 // The following explains the code-gen scenario for the `target` directive. A
7822 // similar scenario is followed for other device-related directives (e.g.
7823 // `target enter data`), but in a simpler fashion, since we only need to emit
7824 // a task that encapsulates the proper runtime call.
7825 //
7826 // When we arrive at this function, the target region itself has been
7827 // outlined into the function OutlinedFn.
7828 // So at this point, for
7829 // --------------------------------------------------------------
7830 // void user_code_that_offloads(...) {
7831 // omp target depend(..) map(from:a) map(to:b) private(i)
7832 // do i = 1, 10
7833 // a(i) = b(i) + n
7834 // }
7835 //
7836 // --------------------------------------------------------------
7837 //
7838 // we have
7839 //
7840 // --------------------------------------------------------------
7841 //
7842 // void user_code_that_offloads(...) {
7843 // %.offload_baseptrs = alloca [2 x ptr], align 8
7844 // %.offload_ptrs = alloca [2 x ptr], align 8
7845 // %.offload_mappers = alloca [2 x ptr], align 8
7846 // ;; target region has been outlined and now we need to
7847 // ;; offload to it via a target task.
7848 // }
7849 // void outlined_device_function(ptr a, ptr b, ptr n) {
7850 // n = *n_ptr;
7851 // do i = 1, 10
7852 // a(i) = b(i) + n
7853 // }
7854 //
7855 // We now have to do the following
7856 // (i) Make an offloading call to outlined_device_function using the OpenMP
7857 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7858 // emitted by emitKernelLaunch.
7859 // (ii) Create a task entry point function that calls kernel_launch_function
7860 // and is the entry point for the target task. See
7861 // '@.omp_target_task_proxy_func' in the pseudocode below.
7862 // (iii) Create a task with the task entry point created in (ii).
7863 //
7864 // That is we create the following
7865 // struct task_with_privates {
7866 // struct kmp_task_ompbuilder_t task_struct;
7867 // struct privates {
7868 // [2 x ptr] ; baseptrs
7869 // [2 x ptr] ; ptrs
7870 // [2 x i64] ; sizes
7871 // }
7872 // }
7873 // void user_code_that_offloads(...) {
7874 // %.offload_baseptrs = alloca [2 x ptr], align 8
7875 // %.offload_ptrs = alloca [2 x ptr], align 8
7876 // %.offload_sizes = alloca [2 x i64], align 8
7877 //
7878 // %structArg = alloca { ptr, ptr, ptr }, align 8
7879 // %strucArg[0] = a
7880 // %strucArg[1] = b
7881 // %strucArg[2] = &n
7882 //
7883 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7884 // sizeof(kmp_task_ompbuilder_t),
7885 // sizeof(structArg),
7886 // @.omp_target_task_proxy_func,
7887 // ...)
7888 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7889 // sizeof(structArg))
7890 // memcpy(target_task_with_privates->privates->baseptrs,
7891 // offload_baseptrs, sizeof(offload_baseptrs)
7892 // memcpy(target_task_with_privates->privates->ptrs,
7893 // offload_ptrs, sizeof(offload_ptrs)
7894 // memcpy(target_task_with_privates->privates->sizes,
7895 // offload_sizes, sizeof(offload_sizes)
7896 // dependencies_array = ...
7897 // ;; if nowait not present
7898 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7899 // call @__kmpc_omp_task_begin_if0(...)
7900 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7901 // %target_task_with_privates)
7902 // call @__kmpc_omp_task_complete_if0(...)
7903 // }
7904 //
7905 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7906 // ptr %task) {
7907 // %structArg = alloca {ptr, ptr, ptr}
7908 // %task_ptr = getelementptr(%task, 0, 0)
7909 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7910 // memcpy(%structArg, %shared_data, sizeof(%structArg))
7911 //
7912 // %offloading_arrays = getelementptr(%task, 0, 1)
7913 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7914 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7915 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7916 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7917 // %offload_sizes, %structArg)
7918 // }
7919 //
7920 // We need the proxy function because the signature of the task entry point
7921 // expected by kmpc_omp_task is always the same and will be different from
7922 // that of the kernel_launch function.
7923 //
7924 // kernel_launch_function is generated by emitKernelLaunch and has the
7925 // always_inline attribute. For this example, it'll look like so:
7926 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7927 // %offload_sizes, %structArg) alwaysinline {
7928 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7929 // ; load aggregated data from %structArg
7930 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7931 // ; offload_sizes
7932 // call i32 @__tgt_target_kernel(...,
7933 // outlined_device_function,
7934 // ptr %kernel_args)
7935 // }
7936 // void outlined_device_function(ptr a, ptr b, ptr n) {
7937 // n = *n_ptr;
7938 // do i = 1, 10
7939 // a(i) = b(i) + n
7940 // }
7941 //
7942 BasicBlock *TargetTaskBodyBB =
7943 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7944 BasicBlock *TargetTaskAllocaBB =
7945 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7946
7947 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7948 TargetTaskAllocaBB->begin());
7949 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7950
7951 OutlineInfo OI;
7952 OI.EntryBB = TargetTaskAllocaBB;
7953 OI.OuterAllocaBB = AllocaIP.getBlock();
7954
7955 // Add the thread ID argument.
7956 SmallVector<Instruction *, 4> ToBeDeleted;
7957 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7958 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7959
7960 // Generate the task body which will subsequently be outlined.
7961 Builder.restoreIP(TargetTaskBodyIP);
7962 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7963 return Err;
7964
7965 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7966 // it is given. These blocks are enumerated by
7967 // OpenMPIRBuilder::OutlineInfo::collectBlocks, which expects the OI.ExitBlock
7968 // to be outside the region. In other words, OI.ExitBlock is expected to be
7969 // the start of the region after the outlining. We used to set OI.ExitBlock
7970 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7971 // except when the task body is a single basic block. In that case,
7972 // OI.ExitBlock is set to the single task body block and will get left out of
7973 // the outlining process. So, simply create a new empty block to which we
7974 // unconditionally branch from where TaskBodyCB left off.
7975 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7976 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7977 /*IsFinished=*/true);
7978
7979 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7980 bool NeedsTargetTask = HasNoWait && DeviceID;
7981 if (NeedsTargetTask) {
7982 for (auto *V :
7983 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7984 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7985 RTArgs.SizesArray}) {
7986 if (V && !isa<ConstantPointerNull, UndefValue>(V)) {
7987 OffloadingArraysToPrivatize.push_back(V);
7988 OI.ExcludeArgsFromAggregate.push_back(V);
7989 }
7990 }
7991 }
7992 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7993 DeviceID, OffloadingArraysToPrivatize](
7994 Function &OutlinedFn) mutable {
7995 assert(OutlinedFn.hasOneUse() &&
7996 "there must be a single user for the outlined function");
7997
7998 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7999
8000 // The first argument of StaleCI is always the thread id.
8001 // The next few arguments are the pointers to offloading arrays
8002 // if any. (see OffloadingArraysToPrivatize)
8003 // Finally, all other local values that are live-in to the outlined region
8004 // end up in a structure whose pointer is passed as the last argument. This
8005 // piece of data is passed in the "shared" field of the task structure. So,
8006 // we know we have to pass shareds to the task if the number of arguments is
8007 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8008 // thread id. Further, for safety, we assert that the number of arguments of
8009 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
8010 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8011 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8012 assert((!HasShareds ||
8013 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8014 "Wrong number of arguments for StaleCI when shareds are present");
8015 int SharedArgOperandNo =
8016 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8017
8018 StructType *TaskWithPrivatesTy =
8019 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8020 StructType *PrivatesTy = nullptr;
8021
8022 if (!OffloadingArraysToPrivatize.empty())
8023 PrivatesTy =
8024 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8025
8026 Function *ProxyFn = emitTargetTaskProxyFunction(
8027 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8028 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8029
8030 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8031 << "\n");
8032
8033 Builder.SetInsertPoint(StaleCI);
8034
8035 // Gather the arguments for emitting the runtime call.
8036 uint32_t SrcLocStrSize;
8037 Constant *SrcLocStr =
8038 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8039 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8040
8041 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8042 //
8043 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8044 // the DeviceID to the deferred task, and also because
8045 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8046 Function *TaskAllocFn =
8047 !NeedsTargetTask
8048 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8049 : getOrCreateRuntimeFunctionPtr(
8050 OMPRTL___kmpc_omp_target_task_alloc);
8051
8052 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task alloc
8053 // call.
8054 Value *ThreadID = getOrCreateThreadID(Ident);
8055
8056 // Argument - `sizeof_kmp_task_t` (TaskSize)
8057 // Tasksize refers to the size in bytes of kmp_task_t data structure
8058 // plus any other data to be passed to the target task, if any, which
8059 // is packed into a struct. kmp_task_t and the struct so created are
8060 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8061 Value *TaskSize = Builder.getInt64(
8062 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8063
8064 // Argument - `sizeof_shareds` (SharedsSize)
8065 // SharedsSize refers to the shareds array size in the kmp_task_t data
8066 // structure.
8067 Value *SharedsSize = Builder.getInt64(0);
8068 if (HasShareds) {
8069 auto *ArgStructAlloca =
8070 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8071 assert(ArgStructAlloca &&
8072 "Unable to find the alloca instruction corresponding to arguments "
8073 "for extracted function");
8074 auto *ArgStructType =
8075 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8076 assert(ArgStructType && "Unable to find struct type corresponding to "
8077 "arguments for extracted function");
8078 SharedsSize =
8079 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8080 }
8081
8082 // Argument - `flags`
8083 // Task is tied iff (Flags & 1) == 1.
8084 // Task is untied iff (Flags & 1) == 0.
8085 // Task is final iff (Flags & 2) == 2.
8086 // Task is not final iff (Flags & 2) == 0.
8087 // A target task is not final and is untied.
8088 Value *Flags = Builder.getInt32(0);
8089
8090 // Emit the @__kmpc_omp_task_alloc runtime call
8091 // The runtime call returns a pointer to an area where the task's captured
8092 // variables must be copied before the task is run (TaskData).
8093 CallInst *TaskData = nullptr;
8094
8095 SmallVector<llvm::Value *> TaskAllocArgs = {
8096 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8097 /*flags=*/Flags,
8098 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8099 /*task_func=*/ProxyFn};
8100
8101 if (NeedsTargetTask) {
8102 assert(DeviceID && "Expected non-empty device ID.");
8103 TaskAllocArgs.push_back(DeviceID);
8104 }
8105
8106 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8107
8108 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8109 if (HasShareds) {
8110 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8111 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8112 *this, Builder, TaskData, TaskWithPrivatesTy);
8113 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8114 SharedsSize);
8115 }
8116 if (!OffloadingArraysToPrivatize.empty()) {
8117 Value *Privates =
8118 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8119 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8120 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8121 [[maybe_unused]] Type *ArrayType =
8122 getOffloadingArrayType(PtrToPrivatize);
8123 assert(ArrayType && "ArrayType cannot be nullptr");
8124
8125 Type *ElementType = PrivatesTy->getElementType(i);
8126 assert(ElementType == ArrayType &&
8127 "ElementType should match ArrayType");
8128 (void)ArrayType;
8129
8130 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8131 Builder.CreateMemCpy(
8132 Dst, Alignment, PtrToPrivatize, Alignment,
8133 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8134 }
8135 }
8136
8137 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8138
8139 // ---------------------------------------------------------------
8140 // V5.2 13.8 target construct
8141 // If the nowait clause is present, execution of the target task
8142 // may be deferred. If the nowait clause is not present, the target task is
8143 // an included task.
8144 // ---------------------------------------------------------------
8145 // The above means that the lack of a nowait on the target construct
8146 // translates to '#pragma omp task if(0)'
8147 if (!NeedsTargetTask) {
8148 if (DepArray) {
8149 Function *TaskWaitFn =
8150 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8151 Builder.CreateCall(
8152 TaskWaitFn,
8153 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8154 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8155 /*dep_list=*/DepArray,
8156 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8157 /*noalias_dep_list=*/
8158 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8159 }
8160 // Included task.
8161 Function *TaskBeginFn =
8162 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8163 Function *TaskCompleteFn =
8164 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8165 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8166 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8167 CI->setDebugLoc(StaleCI->getDebugLoc());
8168 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8169 } else if (DepArray) {
8170 // HasNoWait - meaning the task may be deferred. Call
8171 // __kmpc_omp_task_with_deps if there are dependencies,
8172 // else call __kmpc_omp_task
8173 Function *TaskFn =
8174 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8175 Builder.CreateCall(
8176 TaskFn,
8177 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8178 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8179 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8180 } else {
8181 // Emit the @__kmpc_omp_task runtime call to spawn the task
8182 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8183 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8184 }
8185
8186 StaleCI->eraseFromParent();
8187 for (Instruction *I : llvm::reverse(ToBeDeleted))
8188 I->eraseFromParent();
8189 };
8190 addOutlineInfo(std::move(OI));
8191
8192 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8193 << *(Builder.GetInsertBlock()) << "\n");
8194 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8195 << *(Builder.GetInsertBlock()->getParent()->getParent())
8196 << "\n");
8197 return Builder.saveIP();
8198}
8199
8200Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8201 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8202 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8203 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8204 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8205 if (Error Err =
8206 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8207 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8208 return Err;
8209 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8210 return Error::success();
8211}
8212
8213static void emitTargetCall(
8214 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8215 OpenMPIRBuilder::InsertPointTy AllocaIP,
8216 OpenMPIRBuilder::TargetDataInfo &Info,
8217 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8218 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8219 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8220 SmallVectorImpl<Value *> &Args,
8221 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8222 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8223 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8224 bool HasNoWait) {
8225 // Generate a function call to the host fallback implementation of the target
8226 // region. This is called by the host when no offload entry was generated for
8227 // the target region, or when the offloading call fails at runtime.
8228 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8229 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8230 Builder.restoreIP(IP);
8231 Builder.CreateCall(OutlinedFn, Args);
8232 return Builder.saveIP();
8233 };
8234
8235 bool HasDependencies = Dependencies.size() > 0;
8236 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8237
8238 OpenMPIRBuilder::TargetKernelArgs KArgs;
8239
8240 auto TaskBodyCB =
8241 [&](Value *DeviceID, Value *RTLoc,
8242 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8243 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8244 // produce any.
8245 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8246 // emitKernelLaunch makes the necessary runtime call to offload the
8247 // kernel. We then outline all that code into a separate function
8248 // ('kernel_launch_function' in the pseudo code above). This function is
8249 // then called by the target task proxy function (see
8250 // '@.omp_target_task_proxy_func' in the pseudo code above)
8251 // "@.omp_target_task_proxy_func' is generated by
8252 // emitTargetTaskProxyFunction.
8253 if (OutlinedFnID && DeviceID)
8254 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8255 EmitTargetCallFallbackCB, KArgs,
8256 DeviceID, RTLoc, TargetTaskAllocaIP);
8257
8258 // We only need to do the outlining if `DeviceID` is set to avoid calling
8259 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8260 // generating the `else` branch of an `if` clause.
8261 //
8262 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8263 // In this case, we execute the host implementation directly.
8264 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8265 }());
8266
8267 OMPBuilder.Builder.restoreIP(AfterIP);
8268 return Error::success();
8269 };
8270
8271 auto &&EmitTargetCallElse =
8272 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8273 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8274 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8275 // produce any.
8276 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8277 if (RequiresOuterTargetTask) {
8278 // Arguments that are intended to be directly forwarded to an
8279 // emitKernelLaunch call are passed as nullptr, since
8280 // OutlinedFnID=nullptr results in that call not being done.
8281 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8282 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8283 /*RTLoc=*/nullptr, AllocaIP,
8284 Dependencies, EmptyRTArgs, HasNoWait);
8285 }
8286 return EmitTargetCallFallbackCB(Builder.saveIP());
8287 }());
8288
8289 Builder.restoreIP(AfterIP);
8290 return Error::success();
8291 };
8292
8293 auto &&EmitTargetCallThen =
8294 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8295 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8296 Info.HasNoWait = HasNoWait;
8297 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8298 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8299 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8300 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8301 /*IsNonContiguous=*/true,
8302 /*ForEndCall=*/false))
8303 return Err;
8304
8305 SmallVector<Value *, 3> NumTeamsC;
8306 for (auto [DefaultVal, RuntimeVal] :
8307 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8308 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8309 : Builder.getInt32(DefaultVal));
8310
8311 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8312 // the minimum of the optional THREAD_LIMIT and NUM_THREADS clauses.
8313 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8314 if (Clause)
8315 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8316 /*isSigned=*/false);
8317 return Clause;
8318 };
8319 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8320 if (Clause)
8321 Result =
8322 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8323 Result, Clause)
8324 : Clause;
8325 };
8326
8327 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8328 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8329 SmallVector<Value *, 3> NumThreadsC;
8330 Value *MaxThreadsClause =
8331 RuntimeAttrs.TeamsThreadLimit.size() == 1
8332 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8333 : nullptr;
8334
8335 for (auto [TeamsVal, TargetVal] : zip_equal(
8336 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8337 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8338 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8339
8340 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8341 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8342
8343 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8344 }
8345
8346 unsigned NumTargetItems = Info.NumberOfPtrs;
8347 // TODO: Use correct device ID
8348 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8349 uint32_t SrcLocStrSize;
8350 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8351 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8352 llvm::omp::IdentFlag(0), 0);
8353
8354 Value *TripCount = RuntimeAttrs.LoopTripCount
8355 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8356 Builder.getInt64Ty(),
8357 /*isSigned=*/false)
8358 : Builder.getInt64(0);
8359
8360 // TODO: Use correct DynCGGroupMem
8361 Value *DynCGGroupMem = Builder.getInt32(0);
8362
8363 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8364 NumTeamsC, NumThreadsC,
8365 DynCGGroupMem, HasNoWait);
8366
8367 // Assume no error was returned because TaskBodyCB and
8368 // EmitTargetCallFallbackCB don't produce any.
8369 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8370 // The presence of certain clauses on the target directive requires the
8371 // explicit generation of the target task.
8372 if (RequiresOuterTargetTask)
8373 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8374 Dependencies, KArgs.RTArgs,
8375 Info.HasNoWait);
8376
8377 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8378 EmitTargetCallFallbackCB, KArgs,
8379 DeviceID, RTLoc, AllocaIP);
8380 }());
8381
8382 Builder.restoreIP(AfterIP);
8383 return Error::success();
8384 };
8385
8386 // If we don't have an ID for the target region, it means an offload entry
8387 // wasn't created. In this case we just run the host fallback directly and
8388 // ignore any potential 'if' clauses.
8389 if (!OutlinedFnID) {
8390 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8391 return;
8392 }
8393
8394 // If there's no 'if' clause, only generate the kernel launch code path.
8395 if (!IfCond) {
8396 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8397 return;
8398 }
8399
8400 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8401 EmitTargetCallElse, AllocaIP));
8402}
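// Putting the helpers above together, the host-side shape is roughly:
//   no OutlinedFnID -> host fallback call only (ignores any `if` clause)
//   no IfCond       -> kernel-launch path only
//   otherwise       -> br i1 %ifcond, label %then, label %else
//                      (then: kernel launch, else: host fallback)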
8403
8404OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8405 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8406 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8407 TargetRegionEntryInfo &EntryInfo,
8408 const TargetKernelDefaultAttrs &DefaultAttrs,
8409 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8410 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8411 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8412 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8413 CustomMapperCallbackTy CustomMapperCB,
8414 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8415
8416 if (!updateToLocation(Loc))
8417 return InsertPointTy();
8418
8419 Builder.restoreIP(CodeGenIP);
8420
8421 Function *OutlinedFn;
8422 Constant *OutlinedFnID = nullptr;
8423 // The target region is outlined into its own function. The LLVM IR for
8424 // the target region itself is generated using the callbacks CBFunc
8425 // and ArgAccessorFuncCB.
8426 if (Error Err = emitTargetOutlinedFunction(
8427 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8428 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8429 return Err;
8430
8431 // If we are not on the target device, then we need to generate code
8432 // to make a remote call (offload) to the previously outlined function
8433 // that represents the target region. Do that now.
8434 if (!Config.isTargetDevice())
8435 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8436 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8437 CustomMapperCB, Dependencies, HasNowait);
8438 return Builder.saveIP();
8439}
8440
8441std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8442 StringRef FirstSeparator,
8443 StringRef Separator) {
8444 SmallString<128> Buffer;
8445 llvm::raw_svector_ostream OS(Buffer);
8446 StringRef Sep = FirstSeparator;
8447 for (StringRef Part : Parts) {
8448 OS << Sep << Part;
8449 Sep = Separator;
8450 }
8451 return OS.str().str();
8452}
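// For example, getNameWithSeparators({"omp", "foo", "var"}, "_", ".")
// produces "_omp.foo.var": the first separator precedes the first part and
// the regular separator precedes each remaining part.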
8453
8454std::string
8455OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8456 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8457 Config.separator());
8458}
8459
8460GlobalVariable *
8461OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
8462 unsigned AddressSpace) {
8463 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8464 if (Elem.second) {
8465 assert(Elem.second->getValueType() == Ty &&
8466 "OMP internal variable has different type than requested");
8467 } else {
8468 // TODO: investigate the appropriate linkage type used for the global
8469 // variable for possibly changing that to internal or private, or maybe
8470 // create different versions of the function for different OMP internal
8471 // variables.
8472 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8473 ? GlobalValue::InternalLinkage
8474 : GlobalValue::CommonLinkage;
8475 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8476 Constant::getNullValue(Ty), Elem.first(),
8477 /*InsertBefore=*/nullptr,
8478 GlobalValue::NotThreadLocal, AddressSpace);
8479 const DataLayout &DL = M.getDataLayout();
8480 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8481 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
8482 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8483 Elem.second = GV;
8484 }
8485
8486 return Elem.second;
8487}
8488
8489Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8490 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8491 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8492 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8493}
8494
8495Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8496 LLVMContext &Ctx = Builder.getContext();
8497 Value *Null =
8498 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8499 Value *SizeGep =
8500 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8501 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8502 return SizePtrToInt;
8503}
8504
8505 GlobalVariable *
8506 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8507 std::string VarName) {
8508 llvm::Constant *MaptypesArrayInit =
8509 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8510 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8511 M, MaptypesArrayInit->getType(),
8512 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8513 VarName);
8514 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8515 return MaptypesArrayGlobal;
8516}
8517
8518void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8519 InsertPointTy AllocaIP,
8520 unsigned NumOperands,
8521 struct MapperAllocas &MapperAllocas) {
8522 if (!updateToLocation(Loc))
8523 return;
8524
8525 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8526 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8527 Builder.restoreIP(AllocaIP);
8528 AllocaInst *ArgsBase = Builder.CreateAlloca(
8529 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8530 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8531 ".offload_ptrs");
8532 AllocaInst *ArgSizes = Builder.CreateAlloca(
8533 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8534 updateToLocation(Loc);
8535 MapperAllocas.ArgsBase = ArgsBase;
8536 MapperAllocas.Args = Args;
8537 MapperAllocas.ArgSizes = ArgSizes;
8538}
8539
8540void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8541 Function *MapperFunc, Value *SrcLocInfo,
8542 Value *MaptypesArg, Value *MapnamesArg,
8543 struct MapperAllocas &MapperAllocas,
8544 int64_t DeviceID, unsigned NumOperands) {
8545 if (!updateToLocation(Loc))
8546 return;
8547
8548 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8549 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8550 Value *ArgsBaseGEP =
8551 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8552 {Builder.getInt32(0), Builder.getInt32(0)});
8553 Value *ArgsGEP =
8554 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8555 {Builder.getInt32(0), Builder.getInt32(0)});
8556 Value *ArgSizesGEP =
8557 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8558 {Builder.getInt32(0), Builder.getInt32(0)});
8559 Value *NullPtr =
8560 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8561 Builder.CreateCall(MapperFunc,
8562 {SrcLocInfo, Builder.getInt64(DeviceID),
8563 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8564 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8565}
8566
8567void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8568 TargetDataRTArgs &RTArgs,
8569 TargetDataInfo &Info,
8570 bool ForEndCall) {
8571 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8572 "expected region end call to runtime only when end call is separate");
8573 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8574 auto VoidPtrTy = UnqualPtrTy;
8575 auto VoidPtrPtrTy = UnqualPtrTy;
8576 auto Int64Ty = Type::getInt64Ty(M.getContext());
8577 auto Int64PtrTy = UnqualPtrTy;
8578
8579 if (!Info.NumberOfPtrs) {
8580 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8581 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8582 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8583 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8584 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8585 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8586 return;
8587 }
8588
8589 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8590 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8591 Info.RTArgs.BasePointersArray,
8592 /*Idx0=*/0, /*Idx1=*/0);
8593 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8594 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8595 /*Idx0=*/0,
8596 /*Idx1=*/0);
8597 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8598 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8599 /*Idx0=*/0, /*Idx1=*/0);
8600 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8601 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8602 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8603 : Info.RTArgs.MapTypesArray,
8604 /*Idx0=*/0,
8605 /*Idx1=*/0);
8606
8607 // Only emit the mapper information arrays if debug information is
8608 // requested.
8609 if (!Info.EmitDebug)
8610 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8611 else
8612 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8613 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8614 /*Idx0=*/0,
8615 /*Idx1=*/0);
8616 // If there is no user-defined mapper, set the mapper array to nullptr to
8617 // avoid an unnecessary data privatization.
8618 if (!Info.HasMapper)
8619 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8620 else
8621 RTArgs.MappersArray =
8622 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8623}
8624
8625void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8626 InsertPointTy CodeGenIP,
8627 MapInfosTy &CombinedInfo,
8628 TargetDataInfo &Info) {
8629 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8630 CombinedInfo.NonContigInfo;
8631
8632 // Build an array of struct descriptor_dim and then assign it to
8633 // offload_args.
8634 //
8635 // struct descriptor_dim {
8636 // uint64_t offset;
8637 // uint64_t count;
8638 // uint64_t stride
8639 // };
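// One descriptor_dim is emitted per dimension of a non-contiguous section,
// e.g. a strided slice such as a[0:n:2] (source-level sketch); the runtime
// walks these descriptors to transfer each contiguous chunk separately.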
8640 Type *Int64Ty = Builder.getInt64Ty();
8641 StructType *DimTy = StructType::create(
8642 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8643 "struct.descriptor_dim");
8644
8645 enum { OffsetFD = 0, CountFD, StrideFD };
8646 // We need two index variables here since the size of "Dims" is the same as
8647 // the size of Components; however, the size of offset, count, and stride is
8648 // equal to the size of the base declaration that is non-contiguous.
8649 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8650 // Skip emitting IR if the dimension size is 1, since it cannot be
8651 // non-contiguous.
8652 if (NonContigInfo.Dims[I] == 1)
8653 continue;
8654 Builder.restoreIP(AllocaIP);
8655 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8656 AllocaInst *DimsAddr =
8657 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8658 Builder.restoreIP(CodeGenIP);
8659 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8660 unsigned RevIdx = EE - II - 1;
8661 Value *DimsLVal = Builder.CreateInBoundsGEP(
8662 DimsAddr->getAllocatedType(), DimsAddr,
8663 {Builder.getInt64(0), Builder.getInt64(II)});
8664 // Offset
8665 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8666 Builder.CreateAlignedStore(
8667 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8668 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8669 // Count
8670 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8671 Builder.CreateAlignedStore(
8672 NonContigInfo.Counts[L][RevIdx], CountLVal,
8673 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8674 // Stride
8675 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8676 Builder.CreateAlignedStore(
8677 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8678 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8679 }
8680 // args[I] = &dims
8681 Builder.restoreIP(CodeGenIP);
8682 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8683 DimsAddr, Builder.getPtrTy());
8684 Value *P = Builder.CreateConstInBoundsGEP2_32(
8685 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8686 Info.RTArgs.PointersArray, 0, I);
8687 Builder.CreateAlignedStore(
8688 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8689 ++L;
8690 }
8691}
8692
8693void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8694 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8695 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8696 BasicBlock *ExitBB, bool IsInit) {
8697 StringRef Prefix = IsInit ? ".init" : ".del";
8698
8699 // Evaluate if this is an array section.
8700 BasicBlock *BodyBB = BasicBlock::Create(
8701 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8702 Value *IsArray =
8703 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8704 Value *DeleteBit = Builder.CreateAnd(
8705 MapType,
8706 Builder.getInt64(
8707 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8708 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8709 Value *DeleteCond;
8710 Value *Cond;
8711 if (IsInit) {
8712 // base != begin?
8713 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8714 // IsPtrAndObj?
8715 Value *PtrAndObjBit = Builder.CreateAnd(
8716 MapType,
8717 Builder.getInt64(
8718 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8719 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8720 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8721 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8722 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8723 DeleteCond = Builder.CreateIsNull(
8724 DeleteBit,
8725 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8726 } else {
8727 Cond = IsArray;
8728 DeleteCond = Builder.CreateIsNotNull(
8729 DeleteBit,
8730 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8731 }
8732 Cond = Builder.CreateAnd(Cond, DeleteCond);
8733 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8734
8735 emitBlock(BodyBB, MapperFn);
8736 // Get the array size by multiplying element size and element number (i.e., \p
8737 // Size).
8738 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8739 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8740 // memory allocation/deletion purpose only.
8741 Value *MapTypeArg = Builder.CreateAnd(
8742 MapType,
8743 Builder.getInt64(
8744 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8745 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8746 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8747 MapTypeArg = Builder.CreateOr(
8748 MapTypeArg,
8749 Builder.getInt64(
8750 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8751 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8752
8753 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8754 // data structure.
8755 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8756 ArraySize, MapTypeArg, MapName};
8757 Builder.CreateCall(
8758 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8759 OffloadingArgs);
8760}
8761
8762Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8763 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8764 llvm::Value *BeginArg)>
8765 GenMapInfoCB,
8766 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8767 SmallVector<Type *> Params;
8768 Params.emplace_back(Builder.getPtrTy());
8769 Params.emplace_back(Builder.getPtrTy());
8770 Params.emplace_back(Builder.getPtrTy());
8771 Params.emplace_back(Builder.getInt64Ty());
8772 Params.emplace_back(Builder.getInt64Ty());
8773 Params.emplace_back(Builder.getPtrTy());
8774
8775 auto *FnTy =
8776 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8777
8778 SmallString<64> TyStr;
8779 raw_svector_ostream Out(TyStr);
8780 Function *MapperFn =
8781 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
8782 MapperFn->addFnAttr(Attribute::NoInline);
8783 MapperFn->addFnAttr(Attribute::NoUnwind);
8784 MapperFn->addParamAttr(0, Attribute::NoUndef);
8785 MapperFn->addParamAttr(1, Attribute::NoUndef);
8786 MapperFn->addParamAttr(2, Attribute::NoUndef);
8787 MapperFn->addParamAttr(3, Attribute::NoUndef);
8788 MapperFn->addParamAttr(4, Attribute::NoUndef);
8789 MapperFn->addParamAttr(5, Attribute::NoUndef);
8790
8791 // Start the mapper function code generation.
8792 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8793 auto SavedIP = Builder.saveIP();
8794 Builder.SetInsertPoint(EntryBB);
8795
8796 Value *MapperHandle = MapperFn->getArg(0);
8797 Value *BaseIn = MapperFn->getArg(1);
8798 Value *BeginIn = MapperFn->getArg(2);
8799 Value *Size = MapperFn->getArg(3);
8800 Value *MapType = MapperFn->getArg(4);
8801 Value *MapName = MapperFn->getArg(5);
8802
8803 // Compute the starting and end addresses of array elements.
8804 // Prepare common arguments for array initialization and deletion.
8805 // Convert the size in bytes into the number of array elements.
8806 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8807 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8808 Value *PtrBegin = BeginIn;
8809 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8810
8811 // Emit array initialization if this is an array section and \p MapType indicates
8812 // that memory allocation is required.
8813 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8814 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8815 MapType, MapName, ElementSize, HeadBB,
8816 /*IsInit=*/true);
8817
8818 // Emit a for loop to iterate through SizeArg elements and map all of them.
8819
8820 // Emit the loop header block.
8821 emitBlock(HeadBB, MapperFn);
8822 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8823 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8824 // Evaluate whether the initial condition is satisfied.
8825 Value *IsEmpty =
8826 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8827 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8828
8829 // Emit the loop body block.
8830 emitBlock(BodyBB, MapperFn);
8831 BasicBlock *LastBB = BodyBB;
8832 PHINode *PtrPHI =
8833 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8834 PtrPHI->addIncoming(PtrBegin, HeadBB);
8835
8836 // Get map clause information. Fill up the arrays with all mapped variables.
8837 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8838 if (!Info)
8839 return Info.takeError();
8840
8841 // Call the runtime API __tgt_mapper_num_components to get the number of
8842 // pre-existing components.
8843 Value *OffloadingArgs[] = {MapperHandle};
8844 Value *PreviousSize = Builder.CreateCall(
8845 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8846 OffloadingArgs);
8847 Value *ShiftedPreviousSize =
8848 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8849
8850 // Fill up the runtime mapper handle for all components.
8851 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8852 Value *CurBaseArg = Info->BasePointers[I];
8853 Value *CurBeginArg = Info->Pointers[I];
8854 Value *CurSizeArg = Info->Sizes[I];
8855 Value *CurNameArg = Info->Names.size()
8856 ? Info->Names[I]
8857 : Constant::getNullValue(Builder.getPtrTy());
8858
8859 // Extract the MEMBER_OF field from the map type.
8860 Value *OriMapType = Builder.getInt64(
8861 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8862 Info->Types[I]));
8863 Value *MemberMapType =
8864 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8865
8866 // Combine the map type inherited from user-defined mapper with that
8867 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8868 // bits of the \a MapType, which is the input argument of the mapper
8869 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8870 // bits of MemberMapType.
8871 // [OpenMP 5.0], 1.2.6. map-type decay.
8872 // | alloc | to | from | tofrom | release | delete
8873 // ----------------------------------------------------------
8874 // alloc | alloc | alloc | alloc | alloc | release | delete
8875 // to | alloc | to | alloc | to | release | delete
8876 // from | alloc | alloc | from | from | release | delete
8877 // tofrom | alloc | to | from | tofrom | release | delete
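// For example, a member declared 'to' in the mapper that is reached through
// an enclosing 'tofrom' map decays to 'to' (row 'to', column 'tofrom' above).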
8878 Value *LeftToFrom = Builder.CreateAnd(
8879 MapType,
8880 Builder.getInt64(
8881 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8882 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8883 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8884 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8885 BasicBlock *AllocElseBB =
8886 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8887 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8888 BasicBlock *ToElseBB =
8889 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8890 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8891 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8892 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8893 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8894 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8895 emitBlock(AllocBB, MapperFn);
8896 Value *AllocMapType = Builder.CreateAnd(
8897 MemberMapType,
8898 Builder.getInt64(
8899 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8900 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8901 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8902 Builder.CreateBr(EndBB);
8903 emitBlock(AllocElseBB, MapperFn);
8904 Value *IsTo = Builder.CreateICmpEQ(
8905 LeftToFrom,
8906 Builder.getInt64(
8907 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8908 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8909 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8910 // In case of to, clear OMP_MAP_FROM.
8911 emitBlock(ToBB, MapperFn);
8912 Value *ToMapType = Builder.CreateAnd(
8913 MemberMapType,
8914 Builder.getInt64(
8915 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8916 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8917 Builder.CreateBr(EndBB);
8918 emitBlock(ToElseBB, MapperFn);
8919 Value *IsFrom = Builder.CreateICmpEQ(
8920 LeftToFrom,
8921 Builder.getInt64(
8922 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8923 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8924 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8925 // In case of from, clear OMP_MAP_TO.
8926 emitBlock(FromBB, MapperFn);
8927 Value *FromMapType = Builder.CreateAnd(
8928 MemberMapType,
8929 Builder.getInt64(
8930 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8931 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8932 // In case of tofrom, do nothing.
8933 emitBlock(EndBB, MapperFn);
8934 LastBB = EndBB;
8935 PHINode *CurMapType =
8936 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8937 CurMapType->addIncoming(AllocMapType, AllocBB);
8938 CurMapType->addIncoming(ToMapType, ToBB);
8939 CurMapType->addIncoming(FromMapType, FromBB);
8940 CurMapType->addIncoming(MemberMapType, ToElseBB);
8941
8942 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8943 CurSizeArg, CurMapType, CurNameArg};
8944
8945 auto ChildMapperFn = CustomMapperCB(I);
8946 if (!ChildMapperFn)
8947 return ChildMapperFn.takeError();
8948 if (*ChildMapperFn) {
8949 // Call the corresponding mapper function.
8950 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8951 } else {
8952 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8953 // data structure.
8954 Builder.CreateCall(
8955 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8956 OffloadingArgs);
8957 }
8958 }
8959
8960 // Update the pointer to point to the next element that needs to be mapped,
8961 // and check whether we have mapped all elements.
8962 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8963 "omp.arraymap.next");
8964 PtrPHI->addIncoming(PtrNext, LastBB);
8965 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8966 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8967 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8968
8969 emitBlock(ExitBB, MapperFn);
8970 // Emit array deletion if this is an array section and \p MapType indicates
8971 // that deletion is required.
8972 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8973 MapType, MapName, ElementSize, DoneBB,
8974 /*IsInit=*/false);
8975
8976 // Emit the function exit block.
8977 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8978
8979 Builder.CreateRetVoid();
8980 Builder.restoreIP(SavedIP);
8981 return MapperFn;
8982}
8983
8984Error OpenMPIRBuilder::emitOffloadingArrays(
8985 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8986 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8987 bool IsNonContiguous,
8988 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8989
8990 // Reset the array information.
8991 Info.clearArrayInfo();
8992 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8993
8994 if (Info.NumberOfPtrs == 0)
8995 return Error::success();
8996
8997 Builder.restoreIP(AllocaIP);
8998 // Detect if we have any capture size requiring runtime evaluation of the
8999 // size, so that a constant array can eventually be used.
9000 ArrayType *PointerArrayType =
9001 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9002
9003 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9004 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9005
9006 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9007 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9008 AllocaInst *MappersArray = Builder.CreateAlloca(
9009 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9010 Info.RTArgs.MappersArray = MappersArray;
9011
9012 // If we don't have any VLA types or other types that require runtime
9013 // evaluation, we can use a constant array for the map sizes, otherwise we
9014 // need to fill up the arrays as we do for the pointers.
9015 Type *Int64Ty = Builder.getInt64Ty();
9016 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9017 ConstantInt::get(Int64Ty, 0));
9018 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9019 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9020 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9021 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9022 if (IsNonContiguous &&
9023 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9024 CombinedInfo.Types[I] &
9025 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9026 ConstSizes[I] =
9027 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9028 else
9029 ConstSizes[I] = CI;
9030 continue;
9031 }
9032 }
9033 RuntimeSizes.set(I);
9034 }
9035
9036 if (RuntimeSizes.all()) {
9037 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9038 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9039 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9040 restoreIPandDebugLoc(Builder, CodeGenIP);
9041 } else {
9042 auto *SizesArrayInit = ConstantArray::get(
9043 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9044 std::string Name = createPlatformSpecificName({"offload_sizes"});
9045 auto *SizesArrayGbl =
9046 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9047 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9048 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9049
9050 if (!RuntimeSizes.any()) {
9051 Info.RTArgs.SizesArray = SizesArrayGbl;
9052 } else {
9053 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9054 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9055 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9056 AllocaInst *Buffer = Builder.CreateAlloca(
9057 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9058 Buffer->setAlignment(OffloadSizeAlign);
9059 restoreIPandDebugLoc(Builder, CodeGenIP);
9060 Builder.CreateMemCpy(
9061 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9062 SizesArrayGbl, OffloadSizeAlign,
9063 Builder.getIntN(
9064 IndexSize,
9065 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9066
9067 Info.RTArgs.SizesArray = Buffer;
9068 }
9069 restoreIPandDebugLoc(Builder, CodeGenIP);
9070 }
9071
9072 // The map types are always constant so we don't need to generate code to
9073 // fill arrays. Instead, we create an array constant.
9074 SmallVector<uint64_t, 4> Mapping;
9075 for (auto mapFlag : CombinedInfo.Types)
9076 Mapping.push_back(
9077 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9078 mapFlag));
9079 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9080 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9081 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9082
9083 // The information types are only built if provided.
9084 if (!CombinedInfo.Names.empty()) {
9085 auto *MapNamesArrayGbl = createOffloadMapnames(
9086 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9087 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9088 Info.EmitDebug = true;
9089 } else {
9090 Info.RTArgs.MapNamesArray =
9091 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9092 Info.EmitDebug = false;
9093 }
9094
9095 // If there's a present map type modifier, it must not be applied to the end
9096 // of a region, so generate a separate map type array in that case.
9097 if (Info.separateBeginEndCalls()) {
9098 bool EndMapTypesDiffer = false;
9099 for (uint64_t &Type : Mapping) {
9100 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9101 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9102 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9103 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9104 EndMapTypesDiffer = true;
9105 }
9106 }
9107 if (EndMapTypesDiffer) {
9108 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9109 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9110 }
9111 }
9112
9113 PointerType *PtrTy = Builder.getPtrTy();
9114 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9115 Value *BPVal = CombinedInfo.BasePointers[I];
9116 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9117 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9118 0, I);
9119 Builder.CreateAlignedStore(BPVal, BP,
9120 M.getDataLayout().getPrefTypeAlign(PtrTy));
9121
9122 if (Info.requiresDevicePointerInfo()) {
9123 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9124 CodeGenIP = Builder.saveIP();
9125 Builder.restoreIP(AllocaIP);
9126 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9127 Builder.restoreIP(CodeGenIP);
9128 if (DeviceAddrCB)
9129 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9130 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9131 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9132 if (DeviceAddrCB)
9133 DeviceAddrCB(I, BP);
9134 }
9135 }
9136
9137 Value *PVal = CombinedInfo.Pointers[I];
9138 Value *P = Builder.CreateConstInBoundsGEP2_32(
9139 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9140 I);
9141 // TODO: Check that the alignment is correct.
9142 Builder.CreateAlignedStore(PVal, P,
9143 M.getDataLayout().getPrefTypeAlign(PtrTy));
9144
9145 if (RuntimeSizes.test(I)) {
9146 Value *S = Builder.CreateConstInBoundsGEP2_32(
9147 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9148 /*Idx0=*/0,
9149 /*Idx1=*/I);
9150 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9151 Int64Ty,
9152 /*isSigned=*/true),
9153 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9154 }
9155 // Fill up the mapper array.
9156 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9157 Value *MFunc = ConstantPointerNull::get(PtrTy);
9158
9159 auto CustomMFunc = CustomMapperCB(I);
9160 if (!CustomMFunc)
9161 return CustomMFunc.takeError();
9162 if (*CustomMFunc)
9163 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9164
9165 Value *MAddr = Builder.CreateInBoundsGEP(
9166 MappersArray->getAllocatedType(), MappersArray,
9167 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9168 Builder.CreateAlignedStore(
9169 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9170 }
9171
9172 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9173 Info.NumberOfPtrs == 0)
9174 return Error::success();
9175 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9176 return Error::success();
9177}
9178
9179void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9180 BasicBlock *CurBB = Builder.GetInsertBlock();
9181
9182 if (!CurBB || CurBB->getTerminator()) {
9183 // If there is no insert point or the previous block is already
9184 // terminated, don't touch it.
9185 } else {
9186 // Otherwise, create a fall-through branch.
9187 Builder.CreateBr(Target);
9188 }
9189
9190 Builder.ClearInsertionPoint();
9191}
9192
9193void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9194 bool IsFinished) {
9195 BasicBlock *CurBB = Builder.GetInsertBlock();
9196
9197 // Fall out of the current block (if necessary).
9198 emitBranch(BB);
9199
9200 if (IsFinished && BB->use_empty()) {
9201 BB->eraseFromParent();
9202 return;
9203 }
9204
9205 // Place the block after the current block, if possible, or else at
9206 // the end of the function.
9207 if (CurBB && CurBB->getParent())
9208 CurFn->insert(std::next(CurBB->getIterator()), BB);
9209 else
9210 CurFn->insert(CurFn->end(), BB);
9211 Builder.SetInsertPoint(BB);
9212}
9213
9214Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9215 BodyGenCallbackTy ElseGen,
9216 InsertPointTy AllocaIP) {
9217 // If the condition constant folds and can be elided, try to avoid emitting
9218 // the condition and the dead arm of the if/else.
9219 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9220 auto CondConstant = CI->getSExtValue();
9221 if (CondConstant)
9222 return ThenGen(AllocaIP, Builder.saveIP());
9223
9224 return ElseGen(AllocaIP, Builder.saveIP());
9225 }
9226
9227 Function *CurFn = Builder.GetInsertBlock()->getParent();
9228
9229 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9230 // emit the conditional branch.
9231 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9232 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9233 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9234 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9235 // Emit the 'then' code.
9236 emitBlock(ThenBlock, CurFn);
9237 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9238 return Err;
9239 emitBranch(ContBlock);
9240 // Emit the 'else' code if present.
9241 // There is no need to emit line number for unconditional branch.
9242 emitBlock(ElseBlock, CurFn);
9243 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9244 return Err;
9245 // There is no need to emit line number for unconditional branch.
9246 emitBranch(ContBlock);
9247 // Emit the continuation block for code after the if.
9248 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9249 return Error::success();
9250}
9251
9252bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9253 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9256 "Unexpected Atomic Ordering.");
9257
9258 bool Flush = false;
9259 llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
9260
9261 switch (AK) {
9262 case Read:
9263 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9264 AO == AtomicOrdering::SequentiallyConsistent) {
9265 FlushAO = AtomicOrdering::Acquire;
9266 Flush = true;
9267 }
9268 break;
9269 case Write:
9270 case Compare:
9271 case Update:
9272 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9273 AO == AtomicOrdering::SequentiallyConsistent) {
9274 FlushAO = AtomicOrdering::Release;
9275 Flush = true;
9276 }
9277 break;
9278 case Capture:
9279 switch (AO) {
9280 case AtomicOrdering::Acquire:
9281 FlushAO = AtomicOrdering::Acquire;
9282 Flush = true;
9283 break;
9284 case AtomicOrdering::Release:
9285 FlushAO = AtomicOrdering::Release;
9286 Flush = true;
9287 break;
9288 case AtomicOrdering::AcquireRelease:
9289 case AtomicOrdering::SequentiallyConsistent:
9290 FlushAO = AtomicOrdering::AcquireRelease;
9291 Flush = true;
9292 break;
9293 default:
9294 // do nothing - leave silently.
9295 break;
9296 }
9297 }
9298
9299 if (Flush) {
9300 // The flush runtime call does not yet take a memory ordering. Until it
9301 // does, this resolves which atomic ordering the flush would use, but
9302 // issues the plain flush call regardless.
9303 // TODO: pass `FlushAO` after memory ordering support is added
9304 (void)FlushAO;
9305 emitFlush(Loc);
9306 }
9307
9308 // For AO == AtomicOrdering::Monotonic and all other combinations,
9309 // do nothing.
9310 return Flush;
9311}
9312
9313OpenMPIRBuilder::InsertPointTy
9314OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9315 AtomicOpValue &X, AtomicOpValue &V,
9316 AtomicOrdering AO, InsertPointTy AllocaIP) {
9317 if (!updateToLocation(Loc))
9318 return Loc.IP;
9319
9320 assert(X.Var->getType()->isPointerTy() &&
9321 "OMP Atomic expects a pointer to target memory");
9322 Type *XElemTy = X.ElemTy;
9323 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9324 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9325 "OMP atomic read expected a scalar type");
9326
9327 Value *XRead = nullptr;
9328
9329 if (XElemTy->isIntegerTy()) {
9330 LoadInst *XLD =
9331 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9332 XLD->setAtomic(AO);
9333 XRead = cast<Value>(XLD);
9334 } else if (XElemTy->isStructTy()) {
9335 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9336 // target does not support `atomicrmw` of the size of the struct
9337 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9338 OldVal->setAtomic(AO);
9339 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9340 unsigned LoadSize =
9341 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9342 OpenMPIRBuilder::AtomicInfo atomicInfo(
9343 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9344 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9345 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9346 XRead = AtomicLoadRes.first;
9347 OldVal->eraseFromParent();
9348 } else {
9349 // We need to perform atomic op as integer
9350 IntegerType *IntCastTy =
9351 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9352 LoadInst *XLoad =
9353 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9354 XLoad->setAtomic(AO);
9355 if (XElemTy->isFloatingPointTy()) {
9356 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9357 } else {
9358 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9359 }
9360 }
9361 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9362 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9363 return Builder.saveIP();
9364}
9365
9366OpenMPIRBuilder::InsertPointTy
9367OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9368 AtomicOpValue &X, Value *Expr,
9369 AtomicOrdering AO, InsertPointTy AllocaIP) {
9370 if (!updateToLocation(Loc))
9371 return Loc.IP;
9372
9373 assert(X.Var->getType()->isPointerTy() &&
9374 "OMP Atomic expects a pointer to target memory");
9375 Type *XElemTy = X.ElemTy;
9376 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9377 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9378 "OMP atomic write expected a scalar type");
9379
9380 if (XElemTy->isIntegerTy()) {
9381 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9382 XSt->setAtomic(AO);
9383 } else if (XElemTy->isStructTy()) {
9384 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9385 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9386 unsigned LoadSize =
9387 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9388 OpenMPIRBuilder::AtomicInfo atomicInfo(
9389 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9390 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9391 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9392 OldVal->eraseFromParent();
9393 } else {
9394 // We need to bitcast and perform atomic op as integers
9395 IntegerType *IntCastTy =
9396 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9397 Value *ExprCast =
9398 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9399 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9400 XSt->setAtomic(AO);
9401 }
9402
9403 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9404 return Builder.saveIP();
9405}
9406
9407OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9408 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9409 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9410 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9411 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9412 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9413 if (!updateToLocation(Loc))
9414 return Loc.IP;
9415
9416 LLVM_DEBUG({
9417 Type *XTy = X.Var->getType();
9418 assert(XTy->isPointerTy() &&
9419 "OMP Atomic expects a pointer to target memory");
9420 Type *XElemTy = X.ElemTy;
9421 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9422 XElemTy->isPointerTy()) &&
9423 "OMP atomic update expected a scalar type");
9424 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9425 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9426 "OpenMP atomic does not support LT or GT operations");
9427 });
9428
9429 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9430 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9431 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9432 if (!AtomicResult)
9433 return AtomicResult.takeError();
9434 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9435 return Builder.saveIP();
9436}
9437
9438// FIXME: Duplicating AtomicExpand
9439Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9440 AtomicRMWInst::BinOp RMWOp) {
9441 switch (RMWOp) {
9442 case AtomicRMWInst::Add:
9443 return Builder.CreateAdd(Src1, Src2);
9444 case AtomicRMWInst::Sub:
9445 return Builder.CreateSub(Src1, Src2);
9446 case AtomicRMWInst::And:
9447 return Builder.CreateAnd(Src1, Src2);
9448 case AtomicRMWInst::Nand:
9449 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9450 case AtomicRMWInst::Or:
9451 return Builder.CreateOr(Src1, Src2);
9452 case AtomicRMWInst::Xor:
9453 return Builder.CreateXor(Src1, Src2);
9454 case AtomicRMWInst::Xchg:
9455 case AtomicRMWInst::FAdd:
9456 case AtomicRMWInst::FSub:
9457 case AtomicRMWInst::BAD_BINOP:
9458 case AtomicRMWInst::Max:
9459 case AtomicRMWInst::Min:
9460 case AtomicRMWInst::UMax:
9461 case AtomicRMWInst::UMin:
9462 case AtomicRMWInst::FMax:
9463 case AtomicRMWInst::FMin:
9464 case AtomicRMWInst::FMaximum:
9465 case AtomicRMWInst::FMinimum:
9466 case AtomicRMWInst::UIncWrap:
9467 case AtomicRMWInst::UDecWrap:
9468 case AtomicRMWInst::USubCond:
9469 case AtomicRMWInst::USubSat:
9470 llvm_unreachable("Unsupported atomic update operation");
9471 }
9472 llvm_unreachable("Unsupported atomic update operation");
9473}
9474
9475Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9476 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9477 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9478 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9479 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9480 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9481 // or a complex datatype.
9482 bool emitRMWOp = false;
9483 switch (RMWOp) {
9484 case AtomicRMWInst::Add:
9485 case AtomicRMWInst::And:
9486 case AtomicRMWInst::Nand:
9487 case AtomicRMWInst::Or:
9488 case AtomicRMWInst::Xor:
9489 case AtomicRMWInst::Xchg:
9490 emitRMWOp = XElemTy;
9491 break;
9492 case AtomicRMWInst::Sub:
9493 emitRMWOp = (IsXBinopExpr && XElemTy);
9494 break;
9495 default:
9496 emitRMWOp = false;
9497 }
9498 emitRMWOp &= XElemTy->isIntegerTy();
9499
9500 std::pair<Value *, Value *> Res;
9501 if (emitRMWOp) {
9502 AtomicRMWInst *RMWInst =
9503 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9504 if (T.isAMDGPU()) {
9505 if (IsIgnoreDenormalMode)
9506 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9507 llvm::MDNode::get(Builder.getContext(), {}));
9508 if (!IsFineGrainedMemory)
9509 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9510 llvm::MDNode::get(Builder.getContext(), {}));
9511 if (!IsRemoteMemory)
9512 RMWInst->setMetadata("amdgpu.no.remote.memory",
9513 llvm::MDNode::get(Builder.getContext(), {}));
9514 }
9515 Res.first = RMWInst;
9516 // Not needed except in case of postfix captures. Generate anyway for
9517 // consistency with the else part. Will be removed by any DCE pass.
9518 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9519 if (RMWOp == AtomicRMWInst::Xchg)
9520 Res.second = Res.first;
9521 else
9522 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9523 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9524 XElemTy->isStructTy()) {
9525 LoadInst *OldVal =
9526 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9527 OldVal->setAtomic(AO);
9528 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9529 unsigned LoadSize =
9530 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9531
9532 OpenMPIRBuilder::AtomicInfo atomicInfo(
9533 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9534 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9535 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9536 BasicBlock *CurBB = Builder.GetInsertBlock();
9537 Instruction *CurBBTI = CurBB->getTerminator();
9538 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9539 BasicBlock *ExitBB =
9540 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9541 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9542 X->getName() + ".atomic.cont");
9543 ContBB->getTerminator()->eraseFromParent();
9544 Builder.restoreIP(AllocaIP);
9545 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9546 NewAtomicAddr->setName(X->getName() + "x.new.val");
9547 Builder.SetInsertPoint(ContBB);
9548 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9549 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9550 Value *OldExprVal = PHI;
9551 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9552 if (!CBResult)
9553 return CBResult.takeError();
9554 Value *Upd = *CBResult;
9555 Builder.CreateStore(Upd, NewAtomicAddr);
9556 AtomicOrdering Failure =
9557 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9558 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9559 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9560 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9561 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9562 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9563 OldVal->eraseFromParent();
9564 Res.first = OldExprVal;
9565 Res.second = Upd;
9566
9567 if (UnreachableInst *ExitTI =
9568 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9569 CurBBTI->eraseFromParent();
9570 Builder.SetInsertPoint(ExitBB);
9571 } else {
9572 Builder.SetInsertPoint(ExitTI);
9573 }
9574 } else {
9575 IntegerType *IntCastTy =
9576 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9577 LoadInst *OldVal =
9578 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9579 OldVal->setAtomic(AO);
9580 // CurBB
9581 // | /---\
9582 // ContBB |
9583 // | \---/
9584 // ExitBB
9585 BasicBlock *CurBB = Builder.GetInsertBlock();
9586 Instruction *CurBBTI = CurBB->getTerminator();
9587 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9588 BasicBlock *ExitBB =
9589 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9590 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9591 X->getName() + ".atomic.cont");
9592 ContBB->getTerminator()->eraseFromParent();
9593 Builder.restoreIP(AllocaIP);
9594 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9595 NewAtomicAddr->setName(X->getName() + "x.new.val");
9596 Builder.SetInsertPoint(ContBB);
9597 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9598 PHI->addIncoming(OldVal, CurBB);
9599 bool IsIntTy = XElemTy->isIntegerTy();
9600 Value *OldExprVal = PHI;
9601 if (!IsIntTy) {
9602 if (XElemTy->isFloatingPointTy()) {
9603 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9604 X->getName() + ".atomic.fltCast");
9605 } else {
9606 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9607 X->getName() + ".atomic.ptrCast");
9608 }
9609 }
9610
9611 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9612 if (!CBResult)
9613 return CBResult.takeError();
9614 Value *Upd = *CBResult;
9615 Builder.CreateStore(Upd, NewAtomicAddr);
9616 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9617 AtomicOrdering Failure =
9618 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9619 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9620 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9621 Result->setVolatile(VolatileX);
9622 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9623 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9624 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9625 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9626
9627 Res.first = OldExprVal;
9628 Res.second = Upd;
9629
9630 // set Insertion point in exit block
9631 if (UnreachableInst *ExitTI =
9632 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9633 CurBBTI->eraseFromParent();
9634 Builder.SetInsertPoint(ExitBB);
9635 } else {
9636 Builder.SetInsertPoint(ExitTI);
9637 }
9638 }
9639
9640 return Res;
9641}
9642
9643OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9644 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9645 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9646 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9647 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9648 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9649 if (!updateToLocation(Loc))
9650 return Loc.IP;
9651
9652 LLVM_DEBUG({
9653 Type *XTy = X.Var->getType();
9654 assert(XTy->isPointerTy() &&
9655 "OMP Atomic expects a pointer to target memory");
9656 Type *XElemTy = X.ElemTy;
9657 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9658 XElemTy->isPointerTy()) &&
9659 "OMP atomic capture expected a scalar type");
9660 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9661 "OpenMP atomic does not support LT or GT operations");
9662 });
9663
9664 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9665 // 'x' is simply atomically rewritten with 'expr'.
9666 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9667 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9668 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9669 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9670 if (!AtomicResult)
9671 return AtomicResult.takeError();
9672 Value *CapturedVal =
9673 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9674 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9675
9676 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9677 return Builder.saveIP();
9678}
9679
9680OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9681 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9682 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9683 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9684 bool IsFailOnly) {
9685
9686 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9687 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9688 IsPostfixUpdate, IsFailOnly, Failure);
9689}
9690
9691OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9692 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9693 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9694 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9695 bool IsFailOnly, AtomicOrdering Failure) {
9696
9697 if (!updateToLocation(Loc))
9698 return Loc.IP;
9699
9700 assert(X.Var->getType()->isPointerTy() &&
9701 "OMP atomic expects a pointer to target memory");
9702 // compare capture
9703 if (V.Var) {
9704 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9705 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9706 }
9707
9708 bool IsInteger = E->getType()->isIntegerTy();
9709
9710 if (Op == OMPAtomicCompareOp::EQ) {
9711 AtomicCmpXchgInst *Result = nullptr;
9712 if (!IsInteger) {
9713 IntegerType *IntCastTy =
9714 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9715 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9716 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9717 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9718 AO, Failure);
9719 } else {
9720 Result =
9721 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9722 }
9723
9724 if (V.Var) {
9725 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9726 if (!IsInteger)
9727 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9728 assert(OldValue->getType() == V.ElemTy &&
9729 "OldValue and V must be of same type");
9730 if (IsPostfixUpdate) {
9731 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9732 } else {
9733 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9734 if (IsFailOnly) {
9735 // CurBB----
9736 // | |
9737 // v |
9738 // ContBB |
9739 // | |
9740 // v |
9741 // ExitBB <-
9742 //
9743 // where ContBB only contains the store of old value to 'v'.
9744 BasicBlock *CurBB = Builder.GetInsertBlock();
9745 Instruction *CurBBTI = CurBB->getTerminator();
9746 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9747 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9748 CurBBTI, X.Var->getName() + ".atomic.exit");
9749 BasicBlock *ContBB = CurBB->splitBasicBlock(
9750 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9751 ContBB->getTerminator()->eraseFromParent();
9752 CurBB->getTerminator()->eraseFromParent();
9753
9754 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9755
9756 Builder.SetInsertPoint(ContBB);
9757 Builder.CreateStore(OldValue, V.Var);
9758 Builder.CreateBr(ExitBB);
9759
9760 if (UnreachableInst *ExitTI =
9761 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9762 CurBBTI->eraseFromParent();
9763 Builder.SetInsertPoint(ExitBB);
9764 } else {
9765 Builder.SetInsertPoint(ExitTI);
9766 }
9767 } else {
9768 Value *CapturedValue =
9769 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9770 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9771 }
9772 }
9773 }
9774 // The comparison result has to be stored.
9775 if (R.Var) {
9776 assert(R.Var->getType()->isPointerTy() &&
9777 "r.var must be of pointer type");
9778 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9779
9780 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9781 Value *ResultCast = R.IsSigned
9782 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9783 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9784 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9785 }
9786 } else {
9787 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9788 "Op should be either max or min at this point");
9789 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9790
9791 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9792 // Let's take max as example.
9793 // OpenMP form:
9794 // x = x > expr ? expr : x;
9795 // LLVM form:
9796 // *ptr = *ptr > val ? *ptr : val;
9797 // We need to transform to LLVM form.
9798 // x = x <= expr ? x : expr;
9799 AtomicRMWInst::BinOp NewOp;
9800 if (IsXBinopExpr) {
9801 if (IsInteger) {
9802 if (X.IsSigned)
9803 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9804 : AtomicRMWInst::Max;
9805 else
9806 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9807 : AtomicRMWInst::UMax;
9808 } else {
9809 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9810 : AtomicRMWInst::FMax;
9811 }
9812 } else {
9813 if (IsInteger) {
9814 if (X.IsSigned)
9815 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9816 : AtomicRMWInst::Min;
9817 else
9818 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9819 : AtomicRMWInst::UMin;
9820 } else {
9821 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9822 : AtomicRMWInst::FMin;
9823 }
9824 }
9825
9826 AtomicRMWInst *OldValue =
9827 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9828 if (V.Var) {
9829 Value *CapturedValue = nullptr;
9830 if (IsPostfixUpdate) {
9831 CapturedValue = OldValue;
9832 } else {
9833 CmpInst::Predicate Pred;
9834 switch (NewOp) {
9835 case AtomicRMWInst::Max:
9836 Pred = CmpInst::ICMP_SGT;
9837 break;
9838 case AtomicRMWInst::UMax:
9839 Pred = CmpInst::ICMP_UGT;
9840 break;
9841 case AtomicRMWInst::FMax:
9842 Pred = CmpInst::FCMP_OGT;
9843 break;
9844 case AtomicRMWInst::Min:
9845 Pred = CmpInst::ICMP_SLT;
9846 break;
9847 case AtomicRMWInst::UMin:
9848 Pred = CmpInst::ICMP_ULT;
9849 break;
9850 case AtomicRMWInst::FMin:
9851 Pred = CmpInst::FCMP_OLT;
9852 break;
9853 default:
9854 llvm_unreachable("unexpected comparison op");
9855 }
9856 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9857 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9858 }
9859 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9860 }
9861 }
9862
9863 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9864
9865 return Builder.saveIP();
9866}
9867
9868OpenMPIRBuilder::InsertPointOrErrorTy
9869OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9870 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9871 Value *NumTeamsUpper, Value *ThreadLimit,
9872 Value *IfExpr) {
9873 if (!updateToLocation(Loc))
9874 return InsertPointTy();
9875
9876 uint32_t SrcLocStrSize;
9877 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9878 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9879 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9880
9881 // Outer allocation basicblock is the entry block of the current function.
9882 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9883 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9884 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9885 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9886 }
9887
9888 // The current basic block is split into four basic blocks. After outlining,
9889 // they will be mapped as follows:
9890 // ```
9891 // def current_fn() {
9892 // current_basic_block:
9893 // br label %teams.exit
9894 // teams.exit:
9895 // ; instructions after teams
9896 // }
9897 //
9898 // def outlined_fn() {
9899 // teams.alloca:
9900 // br label %teams.body
9901 // teams.body:
9902 // ; instructions within teams body
9903 // }
9904 // ```
9905 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9906 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9907 BasicBlock *AllocaBB =
9908 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9909
9910 bool SubClausesPresent =
9911 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9912 // Push num_teams
9913 if (!Config.isTargetDevice() && SubClausesPresent) {
9914 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9915 "if lowerbound is non-null, then upperbound must also be non-null "
9916 "for bounds on num_teams");
9917
9918 if (NumTeamsUpper == nullptr)
9919 NumTeamsUpper = Builder.getInt32(0);
9920
9921 if (NumTeamsLower == nullptr)
9922 NumTeamsLower = NumTeamsUpper;
9923
9924 if (IfExpr) {
9925 assert(IfExpr->getType()->isIntegerTy() &&
9926 "argument to if clause must be an integer value");
9927
9928 // upper = ifexpr ? upper : 1
9929 if (IfExpr->getType() != Int1)
9930 IfExpr = Builder.CreateICmpNE(IfExpr,
9931 ConstantInt::get(IfExpr->getType(), 0));
9932 NumTeamsUpper = Builder.CreateSelect(
9933 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9934
9935 // lower = ifexpr ? lower : 1
9936 NumTeamsLower = Builder.CreateSelect(
9937 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9938 }
9939
9940 if (ThreadLimit == nullptr)
9941 ThreadLimit = Builder.getInt32(0);
9942
9943 Value *ThreadNum = getOrCreateThreadID(Ident);
9944 Builder.CreateCall(
9945 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9946 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9947 }
9948 // Generate the body of teams.
9949 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9950 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9951 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9952 return Err;
9953
9954 OutlineInfo OI;
9955 OI.EntryBB = AllocaBB;
9956 OI.ExitBB = ExitBB;
9957 OI.OuterAllocaBB = &OuterAllocaBB;
9958
9959 // Insert fake values for global tid and bound tid.
9961 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9962 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9963 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9964 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9965 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9966
9967 auto HostPostOutlineCB = [this, Ident,
9968 ToBeDeleted](Function &OutlinedFn) mutable {
9969 // The stale call instruction will be replaced with a new call instruction
9970 // that invokes the teams runtime with the outlined function.
9971
9972 assert(OutlinedFn.hasOneUse() &&
9973 "there must be a single user for the outlined function");
9974 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9975 ToBeDeleted.push_back(StaleCI);
9976
9977 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9978 "Outlined function must have two or three arguments only");
9979
9980 bool HasShared = OutlinedFn.arg_size() == 3;
9981
9982 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9983 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9984 if (HasShared)
9985 OutlinedFn.getArg(2)->setName("data");
9986
9987 // Call to the runtime function for teams in the current function.
9988 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9989 "outlined function.");
9990 Builder.SetInsertPoint(StaleCI);
9991 SmallVector<Value *> Args = {
9992 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9993 if (HasShared)
9994 Args.push_back(StaleCI->getArgOperand(2));
9995 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9996 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9997 Args);
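 // Sketch (hypothetical IR, not emitted verbatim): a stale call such as
 //   call void @outlined(ptr %gtid, ptr %btid, ptr %data)
 // is replaced by something along the lines of
 //   call void @__kmpc_fork_teams(ptr %ident, i32 1, ptr @outlined, ptr %data)
 // where the i32 operand counts the captured arguments (arg_size() - 2).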
9998
9999 for (Instruction *I : llvm::reverse(ToBeDeleted))
10000 I->eraseFromParent();
10001 };
10002
10003 if (!Config.isTargetDevice())
10004 OI.PostOutlineCB = HostPostOutlineCB;
10005
10006 addOutlineInfo(std::move(OI));
10007
10008 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10009
10010 return Builder.saveIP();
10011}
10012
10013OpenMPIRBuilder::InsertPointOrErrorTy
10014OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10015 InsertPointTy OuterAllocaIP,
10016 BodyGenCallbackTy BodyGenCB) {
10017 if (!updateToLocation(Loc))
10018 return InsertPointTy();
10019
10020 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10021
10022 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10023 BasicBlock *BodyBB =
10024 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10025 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10026 }
10027 BasicBlock *ExitBB =
10028 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10029 BasicBlock *BodyBB =
10030 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10031 BasicBlock *AllocaBB =
10032 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10033
10034 // Generate the body of the distribute construct.
10035 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10036 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10037 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10038 return Err;
10039
10040 // When using target we use different runtime functions which require a
10041 // callback.
10042 if (Config.isTargetDevice()) {
10043 OutlineInfo OI;
10044 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10045 OI.EntryBB = AllocaBB;
10046 OI.ExitBB = ExitBB;
10047
10048 addOutlineInfo(std::move(OI));
10049 }
10050 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10051
10052 return Builder.saveIP();
10053}
10054
10055GlobalVariable *
10056OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10057 std::string VarName) {
10058 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10059 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10060 Names.size()),
10061 Names);
10062 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10063 M, MapNamesArrayInit->getType(),
10064 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10065 VarName);
10066 return MapNamesArrayGlobal;
10067}
10068
10069// Create all simple and struct types exposed by the runtime and remember
10070// their llvm::PointerTypes for easy access later.
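// For instance (simplified sketch; see OMPKinds.def for the real entries), an
// entry along the lines of
//   OMP_STRUCT_TYPE(Ident, "struct.ident_t", false, Int32, Int32, Int32, Int32, Int8Ptr)
// makes the macros below look up or create %struct.ident_t and initialize both
// the `Ident` and `IdentPtr` members of this builder.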
10071void OpenMPIRBuilder::initializeTypes(Module &M) {
10072 LLVMContext &Ctx = M.getContext();
10073 StructType *T;
10074 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10075#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10076#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10077 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10078 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10079#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10080 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10081 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10082#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10083 T = StructType::getTypeByName(Ctx, StructName); \
10084 if (!T) \
10085 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10086 VarName = T; \
10087 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10088#include "llvm/Frontend/OpenMP/OMPKinds.def"
10089}
10090
10091void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10092 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10093 SmallVectorImpl<BasicBlock *> &BlockVector) {
10094 SmallVector<BasicBlock *, 32> Worklist;
10095 BlockSet.insert(EntryBB);
10096 BlockSet.insert(ExitBB);
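 // Note: seeding BlockSet with ExitBB before the walk means the loop below
 // never enqueues ExitBB, so the collected blocks are exactly those reachable
 // from EntryBB without passing through the region's exit.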
10097
10098 Worklist.push_back(EntryBB);
10099 while (!Worklist.empty()) {
10100 BasicBlock *BB = Worklist.pop_back_val();
10101 BlockVector.push_back(BB);
10102 for (BasicBlock *SuccBB : successors(BB))
10103 if (BlockSet.insert(SuccBB).second)
10104 Worklist.push_back(SuccBB);
10105 }
10106}
10107
10108void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10109 uint64_t Size, int32_t Flags,
10110 GlobalValue::LinkageTypes,
10111 StringRef Name) {
10112 if (!Config.isGPU()) {
10113 llvm::offloading::emitOffloadingEntry(
10114 M, object::OffloadKind::OFK_OpenMP, ID,
10115 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10116 return;
10117 }
10118 // TODO: Add support for global variables on the device after declare target
10119 // support.
10120 Function *Fn = dyn_cast<Function>(Addr);
10121 if (!Fn)
10122 return;
10123
10124 // Add a function attribute for the kernel.
10125 Fn->addFnAttr("kernel");
10126 if (T.isAMDGCN())
10127 Fn->addFnAttr("uniform-work-group-size", "true");
10128 Fn->addFnAttr(Attribute::MustProgress);
10129}
10130
10131// We only generate metadata for functions that contain target regions.
10132void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10133 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10134
10135 // If there are no entries, we don't need to do anything.
10136 if (OffloadInfoManager.empty())
10137 return;
10138
10139 LLVMContext &C = M.getContext();
10140 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10141 TargetRegionEntryInfo>,
10142 16>
10143 OrderedEntries(OffloadInfoManager.size());
10144
10145 // Auxiliary methods to create metadata values and strings.
10146 auto &&GetMDInt = [this](unsigned V) {
10147 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10148 };
10149
10150 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10151
10152 // Create the offloading info metadata node.
10153 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10154 auto &&TargetRegionMetadataEmitter =
10155 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10156 const TargetRegionEntryInfo &EntryInfo,
10157 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10158 // Generate metadata for target regions. Each entry of this metadata
10159 // contains:
10160 // - Entry 0 -> Kind of this type of metadata (0).
10161 // - Entry 1 -> Device ID of the file where the entry was identified.
10162 // - Entry 2 -> File ID of the file where the entry was identified.
10163 // - Entry 3 -> Mangled name of the function where the entry was
10164 // identified.
10165 // - Entry 4 -> Line in the file where the entry was identified.
10166 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10167 // - Entry 6 -> Order the entry was created.
10168 // The first element of the metadata node is the kind.
10169 Metadata *Ops[] = {
10170 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10171 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10172 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10173 GetMDInt(E.getOrder())};
10174
10175 // Save this entry in the right position of the ordered entries array.
10176 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10177
10178 // Add metadata to the named metadata node.
10179 MD->addOperand(MDNode::get(C, Ops));
10180 };
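 // A resulting node might look like (all values are illustrative):
 //   !omp_offload.info = !{!0, ...}
 //   !0 = !{i32 0, i32 <DeviceID>, i32 <FileID>, !"<ParentName>", i32 <Line>, i32 <Count>, i32 <Order>}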
10181
10182 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10183
10184 // Create a function that emits metadata for each device global variable entry.
10185 auto &&DeviceGlobalVarMetadataEmitter =
10186 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10187 StringRef MangledName,
10188 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10189 // Generate metadata for global variables. Each entry of this metadata
10190 // contains:
10191 // - Entry 0 -> Kind of this type of metadata (1).
10192 // - Entry 1 -> Mangled name of the variable.
10193 // - Entry 2 -> Declare target kind.
10194 // - Entry 3 -> Order the entry was created.
10195 // The first element of the metadata node is the kind.
10196 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10197 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10198
10199 // Save this entry in the right position of the ordered entries array.
10200 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10201 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10202
10203 // Add metadata to the named metadata node.
10204 MD->addOperand(MDNode::get(C, Ops));
10205 };
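 // A resulting node might look like (all values are illustrative):
 //   !1 = !{i32 1, !"<MangledName>", i32 <DeclareTargetKind>, i32 <Order>}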
10206
10207 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10208 DeviceGlobalVarMetadataEmitter);
10209
10210 for (const auto &E : OrderedEntries) {
10211 assert(E.first && "All ordered entries must exist!");
10212 if (const auto *CE =
10213 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10214 E.first)) {
10215 if (!CE->getID() || !CE->getAddress()) {
10216 // Do not blame the entry if the parent function is not emitted.
10217 TargetRegionEntryInfo EntryInfo = E.second;
10218 StringRef FnName = EntryInfo.ParentName;
10219 if (!M.getNamedValue(FnName))
10220 continue;
10221 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10222 continue;
10223 }
10224 createOffloadEntry(CE->getID(), CE->getAddress(),
10225 /*Size=*/0, CE->getFlags(),
10226 GlobalValue::WeakAnyLinkage);
10227 } else if (const auto *CE = dyn_cast<
10228 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10229 E.first)) {
10230 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10231 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10232 CE->getFlags());
10233 switch (Flags) {
10234 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10235 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10236 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10237 continue;
10238 if (!CE->getAddress()) {
10239 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10240 continue;
10241 }
10242 // The variable has no definition - no need to add the entry.
10243 if (CE->getVarSize() == 0)
10244 continue;
10245 break;
10246 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10247 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10248 (!Config.isTargetDevice() && CE->getAddress())) &&
10249 "Declaret target link address is set.");
10250 if (Config.isTargetDevice())
10251 continue;
10252 if (!CE->getAddress()) {
10253 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10254 continue;
10255 }
10256 break;
10257 default:
10258 break;
10259 }
10260
10261 // Hidden or internal symbols on the device are not externally visible.
10262 // We should not attempt to register them by creating an offloading
10263 // entry. Indirect variables are handled separately on the device.
10264 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10265 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10266 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10267 continue;
10268
10269 // Indirect globals need to use a special name that doesn't match the name
10270 // of the associated host global.
10271 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10272 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10273 Flags, CE->getLinkage(), CE->getVarName());
10274 else
10275 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10276 Flags, CE->getLinkage());
10277
10278 } else {
10279 llvm_unreachable("Unsupported entry kind.");
10280 }
10281 }
10282
10283 // Emit requires directive globals to a special entry so the runtime can
10284 // register them when the device image is loaded.
10285 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10286 // entries should be redesigned to better suit this use-case.
10287 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10288 offloading::emitOffloadingEntry(
10289 M, object::OffloadKind::OFK_OpenMP,
10290 Constant::getNullValue(PointerType::getUnqual(M.getContext())),
10291 ".requires", /*Size=*/0,
10292 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10293 Config.getRequiresFlags());
10294}
10295
10296void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10297 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10298 unsigned FileID, unsigned Line, unsigned Count) {
10299 raw_svector_ostream OS(Name);
10300 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10301 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10302 if (Count)
10303 OS << "_" << Count;
10304}
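// E.g. (illustrative values): DeviceID 0x10303, FileID 0x4d7b1, parent "foo",
// line 12 and Count 0 produce a kernel name of the form
// "__omp_offloading_10303_4d7b1_foo_l12".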
10305
10306void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10307 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10308 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10309 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10310 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10311 EntryInfo.Line, NewCount);
10312}
10313
10314TargetRegionEntryInfo
10315OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10316 vfs::FileSystem &VFS,
10317 StringRef ParentName) {
10318 sys::fs::UniqueID ID(0xdeadf17e, 0);
10319 auto FileIDInfo = CallBack();
10320 uint64_t FileID = 0;
10321 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
10322 ID = Status->getUniqueID();
10323 FileID = Status->getUniqueID().getFile();
10324 } else {
10325 // If the inode ID could not be determined, create a hash value of the
10326 // current file name and use that as an ID.
10327 FileID = hash_value(std::get<0>(FileIDInfo));
10328 }
10329
10330 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10331 std::get<1>(FileIDInfo));
10332}
10333
10334unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10335 unsigned Offset = 0;
10336 for (uint64_t Remain =
10337 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10338 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10339 !(Remain & 1); Remain = Remain >> 1)
10340 Offset++;
10341 return Offset;
10342}
10343
10344omp::OpenMPOffloadMappingFlags
10345OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10346 // Rotate by getFlagMemberOffset() bits.
10347 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10348 << getFlagMemberOffset());
10349}
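// Sketch of the encoding (assuming OMP_MAP_MEMBER_OF occupies the high 16
// bits of the mapping flags, i.e. mask 0xffff000000000000):
// getFlagMemberOffset() returns 48, and getMemberOfFlag(0) yields 1ULL << 48,
// marking membership in the first component of the enclosing map clause.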
10350
10351void OpenMPIRBuilder::setCorrectMemberOfFlag(
10352 omp::OpenMPOffloadMappingFlags &Flags,
10353 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10354 // If the entry is PTR_AND_OBJ but has not been marked with the special
10355 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10356 // marked as MEMBER_OF.
10357 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10358 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10359 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10360 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10361 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10362 return;
10363
10364 // Reset the placeholder value to prepare the flag for the assignment of the
10365 // proper MEMBER_OF value.
10366 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10367 Flags |= MemberOfFlag;
10368}
10369
10370Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10371 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10372 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10373 bool IsDeclaration, bool IsExternallyVisible,
10374 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10375 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10376 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10377 std::function<Constant *()> GlobalInitializer,
10378 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10379 // TODO: convert this to utilise the IRBuilder Config rather than
10380 // a passed down argument.
10381 if (OpenMPSIMD)
10382 return nullptr;
10383
10384 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10385 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10386 CaptureClause ==
10387 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10388 Config.hasRequiresUnifiedSharedMemory())) {
10389 SmallString<64> PtrName;
10390 {
10391 raw_svector_ostream OS(PtrName);
10392 OS << MangledName;
10393 if (!IsExternallyVisible)
10394 OS << format("_%x", EntryInfo.FileID);
10395 OS << "_decl_tgt_ref_ptr";
10396 }
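 // E.g. (illustrative): a declare-target variable "gvar" yields
 // "gvar_decl_tgt_ref_ptr", or "gvar_<fileid>_decl_tgt_ref_ptr" when the
 // symbol is not externally visible.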
10397
10398 Value *Ptr = M.getNamedValue(PtrName);
10399
10400 if (!Ptr) {
10401 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10402 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10403
10404 auto *GV = cast<GlobalVariable>(Ptr);
10405 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10406
10407 if (!Config.isTargetDevice()) {
10408 if (GlobalInitializer)
10409 GV->setInitializer(GlobalInitializer());
10410 else
10411 GV->setInitializer(GlobalValue);
10412 }
10413
10414 registerTargetGlobalVariable(
10415 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10416 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10417 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10418 }
10419
10420 return cast<Constant>(Ptr);
10421 }
10422
10423 return nullptr;
10424}
10425
10426void OpenMPIRBuilder::registerTargetGlobalVariable(
10427 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10428 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10429 bool IsDeclaration, bool IsExternallyVisible,
10430 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10431 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10432 std::vector<Triple> TargetTriple,
10433 std::function<Constant *()> GlobalInitializer,
10434 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10435 Constant *Addr) {
10436 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10437 (TargetTriple.empty() && !Config.isTargetDevice()))
10438 return;
10439
10440 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10441 StringRef VarName;
10442 int64_t VarSize;
10443 GlobalValue::LinkageTypes Linkage;
10444
10445 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10446 CaptureClause ==
10447 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10448 !Config.hasRequiresUnifiedSharedMemory()) {
10449 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10450 VarName = MangledName;
10451 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10452
10453 if (!IsDeclaration)
10454 VarSize = divideCeil(
10455 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10456 else
10457 VarSize = 0;
10458 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10459
10460 // This is a workaround carried over from Clang which prevents undesired
10461 // optimisation of internal variables.
10462 if (Config.isTargetDevice() &&
10463 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10464 // Do not create a "ref-variable" if the original is not also available
10465 // on the host.
10466 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10467 return;
10468
10469 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10470
10471 if (!M.getNamedValue(RefName)) {
10472 Constant *AddrRef =
10473 getOrCreateInternalVariable(Addr->getType(), RefName);
10474 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10475 GvAddrRef->setConstant(true);
10476 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10477 GvAddrRef->setInitializer(Addr);
10478 GeneratedRefs.push_back(GvAddrRef);
10479 }
10480 }
10481 } else {
10482 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10483 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10484 else
10485 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10486
10487 if (Config.isTargetDevice()) {
10488 VarName = (Addr) ? Addr->getName() : "";
10489 Addr = nullptr;
10490 } else {
10491 Addr = getAddrOfDeclareTargetVar(
10492 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10493 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10494 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10495 VarName = (Addr) ? Addr->getName() : "";
10496 }
10497 VarSize = M.getDataLayout().getPointerSize();
10498 Linkage = GlobalValue::WeakAnyLinkage;
10499 }
10500
10501 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10502 Flags, Linkage);
10503}
10504
10505/// Loads all the offload entries information from the host IR
10506/// metadata.
10507void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10508 // If we are in target mode, load the metadata from the host IR. This code has
10509 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10510
10511 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10512 if (!MD)
10513 return;
10514
10515 for (MDNode *MN : MD->operands()) {
10516 auto &&GetMDInt = [MN](unsigned Idx) {
10517 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10518 return cast<ConstantInt>(V->getValue())->getZExtValue();
10519 };
10520
10521 auto &&GetMDString = [MN](unsigned Idx) {
10522 auto *V = cast<MDString>(MN->getOperand(Idx));
10523 return V->getString();
10524 };
10525
10526 switch (GetMDInt(0)) {
10527 default:
10528 llvm_unreachable("Unexpected metadata!");
10529 break;
10530 case OffloadEntriesInfoManager::OffloadEntryInfo::
10531 OffloadingEntryInfoTargetRegion: {
10532 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10533 /*DeviceID=*/GetMDInt(1),
10534 /*FileID=*/GetMDInt(2),
10535 /*Line=*/GetMDInt(4),
10536 /*Count=*/GetMDInt(5));
10537 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10538 /*Order=*/GetMDInt(6));
10539 break;
10540 }
10541 case OffloadEntriesInfoManager::OffloadEntryInfo::
10542 OffloadingEntryInfoDeviceGlobalVar:
10543 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10544 /*MangledName=*/GetMDString(1),
10545 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10546 /*Flags=*/GetMDInt(2)),
10547 /*Order=*/GetMDInt(3));
10548 break;
10549 }
10550 }
10551}
10552
10553void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10554 StringRef HostFilePath) {
10555 if (HostFilePath.empty())
10556 return;
10557
10558 auto Buf = VFS.getBufferForFile(HostFilePath);
10559 if (std::error_code Err = Buf.getError()) {
10560 report_fatal_error(("error opening host file from host file path inside of "
10561 "OpenMPIRBuilder: " +
10562 Err.message())
10563 .c_str());
10564 }
10565
10566 LLVMContext Ctx;
10567 auto M = expectedToErrorOrAndEmitErrors(
10568 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10569 if (std::error_code Err = M.getError()) {
10570 report_fatal_error(
10571 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10572 .c_str());
10573 }
10574
10575 loadOffloadInfoMetadata(*M.get());
10576}
10577
10578//===----------------------------------------------------------------------===//
10579// OffloadEntriesInfoManager
10580//===----------------------------------------------------------------------===//
10581
10582bool OffloadEntriesInfoManager::empty() const {
10583 return OffloadEntriesTargetRegion.empty() &&
10584 OffloadEntriesDeviceGlobalVar.empty();
10585}
10586
10587unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10588 const TargetRegionEntryInfo &EntryInfo) const {
10589 auto It = OffloadEntriesTargetRegionCount.find(
10590 getTargetRegionEntryCountKey(EntryInfo));
10591 if (It == OffloadEntriesTargetRegionCount.end())
10592 return 0;
10593 return It->second;
10594}
10595
10596void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10597 const TargetRegionEntryInfo &EntryInfo) {
10598 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10599 EntryInfo.Count + 1;
10600}
10601
10602/// Initialize target region entry.
10603void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10604 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10605 OffloadEntriesTargetRegion[EntryInfo] =
10606 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10607 OMPTargetRegionEntryTargetRegion);
10608 ++OffloadingEntriesNum;
10609}
10610
10611void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10612 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10613 OMPTargetRegionEntryKind Flags) {
10614 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10615
10616 // Update the EntryInfo with the next available count for this location.
10617 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10618
10619 // If we are emitting code for a target, the entry is already initialized;
10620 // it only has to be registered.
10621 if (OMPBuilder->Config.isTargetDevice()) {
10622 // This could happen if the device compilation is invoked standalone.
10623 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10624 return;
10625 }
10626 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10627 Entry.setAddress(Addr);
10628 Entry.setID(ID);
10629 Entry.setFlags(Flags);
10630 } else {
10631 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10632 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10633 return;
10634 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10635 "Target region entry already registered!");
10636 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10637 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10638 ++OffloadingEntriesNum;
10639 }
10640 incrementTargetRegionEntryInfoCount(EntryInfo);
10641}
10642
10643bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10644 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10645
10646 // Update the EntryInfo with the next available count for this location.
10647 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10648
10649 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10650 if (It == OffloadEntriesTargetRegion.end()) {
10651 return false;
10652 }
10653 // Fail if this entry is already registered.
10654 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10655 return false;
10656 return true;
10657}
10658
10659void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10660 const OffloadTargetRegionEntryInfoActTy &Action) {
10661 // Scan all target region entries and perform the provided action.
10662 for (const auto &It : OffloadEntriesTargetRegion) {
10663 Action(It.first, It.second);
10664 }
10665}
10666
10667void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10668 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10669 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10670 ++OffloadingEntriesNum;
10671}
10672
10673void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10674 StringRef VarName, Constant *Addr, int64_t VarSize,
10675 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10676 if (OMPBuilder->Config.isTargetDevice()) {
10677 // This could happen if the device compilation is invoked standalone.
10678 if (!hasDeviceGlobalVarEntryInfo(VarName))
10679 return;
10680 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10681 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10682 if (Entry.getVarSize() == 0) {
10683 Entry.setVarSize(VarSize);
10684 Entry.setLinkage(Linkage);
10685 }
10686 return;
10687 }
10688 Entry.setVarSize(VarSize);
10689 Entry.setLinkage(Linkage);
10690 Entry.setAddress(Addr);
10691 } else {
10692 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10693 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10694 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10695 "Entry not initialized!");
10696 if (Entry.getVarSize() == 0) {
10697 Entry.setVarSize(VarSize);
10698 Entry.setLinkage(Linkage);
10699 }
10700 return;
10701 }
10702 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10703 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10704 Addr, VarSize, Flags, Linkage,
10705 VarName.str());
10706 else
10707 OffloadEntriesDeviceGlobalVar.try_emplace(
10708 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10709 ++OffloadingEntriesNum;
10710 }
10711}
10712
10713void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10714 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10715 // Scan all device global variable entries and perform the provided action.
10716 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10717 Action(E.getKey(), E.getValue());
10718}
10719
10720//===----------------------------------------------------------------------===//
10721// CanonicalLoopInfo
10722//===----------------------------------------------------------------------===//
10723
10724void CanonicalLoopInfo::collectControlBlocks(
10725 SmallVectorImpl<BasicBlock *> &BBs) {
10726 // We only count those BBs as control blocks for which we do not need to
10727 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10728 // flow. For consistency, this also means we do not add the Body block, which
10729 // is just the entry to the body code.
10730 BBs.reserve(BBs.size() + 6);
10731 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10732}
10733
10734BasicBlock *CanonicalLoopInfo::getPreheader() const {
10735 assert(isValid() && "Requires a valid canonical loop");
10736 for (BasicBlock *Pred : predecessors(Header)) {
10737 if (Pred != Latch)
10738 return Pred;
10739 }
10740 llvm_unreachable("Missing preheader");
10741}
10742
10743void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10744 assert(isValid() && "Requires a valid canonical loop");
10745
10746 Instruction *CmpI = &getCond()->front();
10747 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10748 CmpI->setOperand(1, TripCount);
10749
10750#ifndef NDEBUG
10751 assertOK();
10752#endif
10753}
10754
10755void CanonicalLoopInfo::mapIndVar(
10756 llvm::function_ref<Value *(Instruction *)> Updater) {
10757 assert(isValid() && "Requires a valid canonical loop");
10758
10759 Instruction *OldIV = getIndVar();
10760
10761 // Record all uses excluding those introduced by the updater. Uses by the
10762 // CanonicalLoopInfo itself to keep track of the number of iterations are
10763 // excluded.
10764 SmallVector<Use *> ReplacableUses;
10765 for (Use &U : OldIV->uses()) {
10766 auto *User = dyn_cast<Instruction>(U.getUser());
10767 if (!User)
10768 continue;
10769 if (User->getParent() == getCond())
10770 continue;
10771 if (User->getParent() == getLatch())
10772 continue;
10773 ReplacableUses.push_back(&U);
10774 }
10775
10776 // Run the updater that may introduce new uses
10777 Value *NewIV = Updater(OldIV);
10778
10779 // Replace the old uses with the value returned by the updater.
10780 for (Use *U : ReplacableUses)
10781 U->set(NewIV);
10782
10783#ifndef NDEBUG
10784 assertOK();
10785#endif
10786}
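// Example use (sketch; the callback body and the ChunkStart value are
// hypothetical):
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(OldIV, ChunkStart);
//   });
// Every prior use of the IV outside Cond/Latch is then rewritten to the sum.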
10787
10788void CanonicalLoopInfo::assertOK() const {
10789#ifndef NDEBUG
10790 // No constraints if this object currently does not describe a loop.
10791 if (!isValid())
10792 return;
10793
10794 BasicBlock *Preheader = getPreheader();
10795 BasicBlock *Body = getBody();
10796 BasicBlock *After = getAfter();
10797
10798 // Verify standard control-flow we use for OpenMP loops.
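 // Shape being verified (sketch):
 //   preheader -> header -> cond -+-> body -> ... -> latch -> header
 //                                '-> exit -> after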
10799 assert(Preheader);
10800 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10801 "Preheader must terminate with unconditional branch");
10802 assert(Preheader->getSingleSuccessor() == Header &&
10803 "Preheader must jump to header");
10804
10805 assert(Header);
10806 assert(isa<BranchInst>(Header->getTerminator()) &&
10807 "Header must terminate with unconditional branch");
10808 assert(Header->getSingleSuccessor() == Cond &&
10809 "Header must jump to exiting block");
10810
10811 assert(Cond);
10812 assert(Cond->getSinglePredecessor() == Header &&
10813 "Exiting block only reachable from header");
10814
10815 assert(isa<BranchInst>(Cond->getTerminator()) &&
10816 "Exiting block must terminate with conditional branch");
10817 assert(size(successors(Cond)) == 2 &&
10818 "Exiting block must have two successors");
10819 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10820 "Exiting block's first successor jump to the body");
10821 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10822 "Exiting block's second successor must exit the loop");
10823
10824 assert(Body);
10825 assert(Body->getSinglePredecessor() == Cond &&
10826 "Body only reachable from exiting block");
10827 assert(!isa<PHINode>(Body->front()));
10828
10829 assert(Latch);
10830 assert(isa<BranchInst>(Latch->getTerminator()) &&
10831 "Latch must terminate with unconditional branch");
10832 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10833 // TODO: To support simple redirecting of the end of the body code when it has
10834 // multiple predecessors, introduce another auxiliary basic block like preheader and after.
10835 assert(Latch->getSinglePredecessor() != nullptr);
10836 assert(!isa<PHINode>(Latch->front()));
10837
10838 assert(Exit);
10839 assert(isa<BranchInst>(Exit->getTerminator()) &&
10840 "Exit block must terminate with unconditional branch");
10841 assert(Exit->getSingleSuccessor() == After &&
10842 "Exit block must jump to after block");
10843
10844 assert(After);
10845 assert(After->getSinglePredecessor() == Exit &&
10846 "After block only reachable from exit block");
10847 assert(After->empty() || !isa<PHINode>(After->front()));
10848
10849 Instruction *IndVar = getIndVar();
10850 assert(IndVar && "Canonical induction variable not found?");
10851 assert(isa<IntegerType>(IndVar->getType()) &&
10852 "Induction variable must be an integer");
10853 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10854 "Induction variable must be a PHI in the loop header");
10855 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10856 assert(
10857 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10858 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10859
10860 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10861 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10862 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10863 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10864 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10865 ->isOne());
10866
10867 Value *TripCount = getTripCount();
10868 assert(TripCount && "Loop trip count not found?");
10869 assert(IndVar->getType() == TripCount->getType() &&
10870 "Trip count and induction variable must have the same type");
10871
10872 auto *CmpI = cast<CmpInst>(&Cond->front());
10873 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10874 "Exit condition must be a signed less-than comparison");
10875 assert(CmpI->getOperand(0) == IndVar &&
10876 "Exit condition must compare the induction variable");
10877 assert(CmpI->getOperand(1) == TripCount &&
10878 "Exit condition must compare with the trip count");
10879#endif
10880}
10881
10882void CanonicalLoopInfo::invalidate() {
10883 Header = nullptr;
10884 Cond = nullptr;
10885 Latch = nullptr;
10886 Exit = nullptr;
10887}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:535
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Static factory to create a '0' constant of arbitrary type.
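A minimal sketch of the constant factories above; getNullValue and getAllOnesValue work the same way for any first-class type:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Build V as a Bits-wide signed immediate.
static Constant *makeSignedImm(LLVMContext &Ctx, unsigned Bits, int64_t V) {
  IntegerType *Ty = IntegerType::get(Ctx, Bits);
  return ConstantInt::getSigned(Ty, V);
}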
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:557
Record of a variable value-assignment, aka a non-instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
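A hedged sketch combining FunctionType::get, Function::Create, and addFnAttr from the entries above; the name "outlined" and its signature are illustrative only:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Declare `void outlined(ptr)` with internal linkage and mark it nounwind.
static Function *declareOutlined(Module &M) {
  LLVMContext &Ctx = M.getContext();
  FunctionType *FTy = FunctionType::get(
      Type::getVoidTy(Ctx), {PointerType::getUnqual(Ctx)}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, GlobalValue::InternalLinkage, "outlined", &M);
  F->addFnAttr(Attribute::NoUnwind);
  return F;
}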
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:523
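A sketch tying together the linkage kinds and initializer APIs above: a private, constant global holding a null-terminated string (the ".str" name is illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static GlobalVariable *makePrivateString(Module &M, StringRef Str) {
  Constant *Init = ConstantDataArray::getString(M.getContext(), Str);
  return new GlobalVariable(M, Init->getType(), /*isConstant=*/true,
                            GlobalValue::PrivateLinkage, Init, ".str");
}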
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
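A minimal sketch of the saveIP/restoreIP pattern the InsertPoint class above supports: emit into the entry block, then resume where the builder was:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static AllocaInst *emitEntryAlloca(IRBuilder<> &B, Function &F, Type *Ty) {
  IRBuilderBase::InsertPoint Saved = B.saveIP(); // remember current position
  BasicBlock &Entry = F.getEntryBlock();
  B.SetInsertPoint(&Entry, Entry.begin());       // hop to the entry block
  AllocaInst *AI = B.CreateAlloca(Ty, nullptr, "tmp");
  B.restoreIP(Saved);                            // resume at the saved point
  return AI;
}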
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
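An illustrative use of the debug-location and metadata accessors above, copying From's location and access-group attachment onto To:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

static void copyLocAndAccessGroup(Instruction *From, Instruction *To) {
  To->setDebugLoc(From->getDebugLoc());
  if (MDNode *MD = From->getMetadata(LLVMContext::MD_access_group))
    To->setMetadata(LLVMContext::MD_access_group, MD);
}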
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1439
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:607
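A sketch of the MDNode/MDString factories above, building the self-referential loop ID !0 = distinct !{!0, !"llvm.loop.unroll.enable"} that loop transforms consume:

#include "llvm/IR/Metadata.h"

using namespace llvm;

static MDNode *makeUnrollEnableLoopID(LLVMContext &Ctx) {
  Metadata *Ops[] = {nullptr, // placeholder for the self-reference
                     MDString::get(Ctx, "llvm.loop.unroll.enable")};
  MDNode *LoopID = MDNode::getDistinct(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID); // first operand points at the node
  return LoopID;
}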
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1749
iterator_range< op_iterator > operands()
Definition Metadata.h:1845
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:247
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:255
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
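A hedged sketch of the two StructType factories above: create makes an identified (named) struct whose body is set afterwards, while StructType::get would produce a literal struct uniqued by shape. The layout shown mirrors the usual OpenMP runtime source-location struct:

#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

static StructType *makeIdentTy(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  StructType *Ident = StructType::create(Ctx, "struct.ident_t");
  Ident->setBody({I32, I32, I32, I32, PointerType::getUnqual(Ctx)});
  return Ident;
}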
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
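An illustrative dispatch built with addCase, assuming Sel is an i32 selector, much as a sections-style lowering emits one case per body:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static SwitchInst *emitDispatch(IRBuilder<> &B, Value *Sel,
                                BasicBlock *Default,
                                ArrayRef<BasicBlock *> Cases) {
  SwitchInst *SI = B.CreateSwitch(Sel, Default, Cases.size());
  IntegerType *I32 = Type::getInt32Ty(B.getContext());
  for (auto [Idx, BB] : enumerate(Cases))
    SI->addCase(ConstantInt::get(I32, Idx), BB);
  return SI;
}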
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1040
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1102
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:411
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1118
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:133
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:149
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return the single user of this value that cannot be dropped, if there is exactly one (that user can have multiple uses of this value).
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
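A minimal sketch of replaceUsesWithIf from the entries above: a RAUW restricted to uses inside one function:

#include "llvm/IR/Instruction.h"

using namespace llvm;

static void replaceUsesInFunction(Value *Old, Value *New, Function *F) {
  Old->replaceUsesWithIf(New, [F](Use &U) {
    auto *UserI = dyn_cast<Instruction>(U.getUser());
    return UserI && UserI->getFunction() == F;
  });
}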
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:294
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
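A sketch of make_early_inc_range guarding mutation during iteration: the current instruction can be erased without invalidating the loop (the deadness test here is a deliberately crude illustration):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static void dropDeadInstrs(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
      I.eraseFromParent();
}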
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:126
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
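A hedged sketch of SplitBlockAndInsertIfThenElse as listed above, assuming the builder's insertion point currently sits on a real instruction:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

static void emitGuarded(IRBuilder<> &B, Value *Cond) {
  Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  SplitBlockAndInsertIfThenElse(Cond, B.GetInsertPoint(), &ThenTerm,
                                &ElseTerm);
  B.SetInsertPoint(ThenTerm); // ...emit the then-side IR before this branch...
  B.SetInsertPoint(ElseTerm); // ...and the else-side IR before this one...
}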
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from their containing function.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...