LLVM 22.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
18#include "llvm/ADT/StringRef.h"
28#include "llvm/IR/Attributes.h"
29#include "llvm/IR/BasicBlock.h"
30#include "llvm/IR/CFG.h"
31#include "llvm/IR/CallingConv.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DIBuilder.h"
37#include "llvm/IR/Function.h"
39#include "llvm/IR/IRBuilder.h"
42#include "llvm/IR/LLVMContext.h"
43#include "llvm/IR/MDBuilder.h"
44#include "llvm/IR/Metadata.h"
46#include "llvm/IR/PassManager.h"
48#include "llvm/IR/Value.h"
60
61#include <cstdint>
62#include <optional>
63
64#define DEBUG_TYPE "openmp-ir-builder"
65
66using namespace llvm;
67using namespace omp;
68
/// Hidden command-line switch: when enabled, OpenMP runtime-call declarations
/// are given optimistic "as-if" attributes (see cl::desc below). Off by
/// default, i.e. only conservatively-correct attributes are emitted.
69static cl::opt<bool>
 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
 cl::desc("Use optimistic attributes describing "
 "'as-if' properties of runtime calls."),
 cl::init(false));
74
76 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
77 cl::desc("Factor for the unroll threshold to account for code "
78 "simplifications still taking place"),
79 cl::init(1.5));
80
81#ifndef NDEBUG
82/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
83/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
84/// an InsertPoint stores the instruction before something is inserted. For
85/// instance, if both point to the same instruction, two IRBuilders alternating
86/// creating instruction will cause the instructions to be interleaved.
89 if (!IP1.isSet() || !IP2.isSet())
90 return false;
91 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
92}
93
95 // Valid ordered/unordered and base algorithm combinations.
96 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
97 case OMPScheduleType::UnorderedStaticChunked:
98 case OMPScheduleType::UnorderedStatic:
99 case OMPScheduleType::UnorderedDynamicChunked:
100 case OMPScheduleType::UnorderedGuidedChunked:
101 case OMPScheduleType::UnorderedRuntime:
102 case OMPScheduleType::UnorderedAuto:
103 case OMPScheduleType::UnorderedTrapezoidal:
104 case OMPScheduleType::UnorderedGreedy:
105 case OMPScheduleType::UnorderedBalanced:
106 case OMPScheduleType::UnorderedGuidedIterativeChunked:
107 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
108 case OMPScheduleType::UnorderedSteal:
109 case OMPScheduleType::UnorderedStaticBalancedChunked:
110 case OMPScheduleType::UnorderedGuidedSimd:
111 case OMPScheduleType::UnorderedRuntimeSimd:
112 case OMPScheduleType::OrderedStaticChunked:
113 case OMPScheduleType::OrderedStatic:
114 case OMPScheduleType::OrderedDynamicChunked:
115 case OMPScheduleType::OrderedGuidedChunked:
116 case OMPScheduleType::OrderedRuntime:
117 case OMPScheduleType::OrderedAuto:
118 case OMPScheduleType::OrderdTrapezoidal:
119 case OMPScheduleType::NomergeUnorderedStaticChunked:
120 case OMPScheduleType::NomergeUnorderedStatic:
121 case OMPScheduleType::NomergeUnorderedDynamicChunked:
122 case OMPScheduleType::NomergeUnorderedGuidedChunked:
123 case OMPScheduleType::NomergeUnorderedRuntime:
124 case OMPScheduleType::NomergeUnorderedAuto:
125 case OMPScheduleType::NomergeUnorderedTrapezoidal:
126 case OMPScheduleType::NomergeUnorderedGreedy:
127 case OMPScheduleType::NomergeUnorderedBalanced:
128 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
129 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
130 case OMPScheduleType::NomergeUnorderedSteal:
131 case OMPScheduleType::NomergeOrderedStaticChunked:
132 case OMPScheduleType::NomergeOrderedStatic:
133 case OMPScheduleType::NomergeOrderedDynamicChunked:
134 case OMPScheduleType::NomergeOrderedGuidedChunked:
135 case OMPScheduleType::NomergeOrderedRuntime:
136 case OMPScheduleType::NomergeOrderedAuto:
137 case OMPScheduleType::NomergeOrderedTrapezoidal:
138 break;
139 default:
140 return false;
141 }
142
143 // Must not set both monotonicity modifiers at the same time.
144 OMPScheduleType MonotonicityFlags =
145 SchedType & OMPScheduleType::MonotonicityMask;
146 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
147 return false;
148
149 return true;
150}
151#endif
152
153/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
154/// debug location to the last instruction in the specified basic block if the
155/// insert point points to the end of the block.
158 Builder.restoreIP(IP);
159 llvm::BasicBlock *BB = Builder.GetInsertBlock();
161 if (!BB->empty() && I == BB->end())
163}
164
165static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
166 if (T.isAMDGPU()) {
167 StringRef Features =
168 Kernel->getFnAttribute("target-features").getValueAsString();
169 if (Features.count("+wavefrontsize64"))
170 return omp::getAMDGPUGridValues<64>();
171 return omp::getAMDGPUGridValues<32>();
172 }
173 if (T.isNVPTX())
175 if (T.isSPIRV())
177 llvm_unreachable("No grid value available for this architecture!");
178}
179
180/// Determine which scheduling algorithm to use, determined from schedule clause
181/// arguments.
182static OMPScheduleType
183getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
184 bool HasSimdModifier) {
185 // Currently, the default schedule it static.
186 switch (ClauseKind) {
187 case OMP_SCHEDULE_Default:
188 case OMP_SCHEDULE_Static:
189 return HasChunks ? OMPScheduleType::BaseStaticChunked
190 : OMPScheduleType::BaseStatic;
191 case OMP_SCHEDULE_Dynamic:
192 return OMPScheduleType::BaseDynamicChunked;
193 case OMP_SCHEDULE_Guided:
194 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
195 : OMPScheduleType::BaseGuidedChunked;
196 case OMP_SCHEDULE_Auto:
198 case OMP_SCHEDULE_Runtime:
199 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
200 : OMPScheduleType::BaseRuntime;
201 }
202 llvm_unreachable("unhandled schedule clause argument");
203}
204
205/// Adds ordering modifier flags to schedule type.
206static OMPScheduleType
208 bool HasOrderedClause) {
209 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
210 OMPScheduleType::None &&
211 "Must not have ordering nor monotonicity flags already set");
212
213 OMPScheduleType OrderingModifier = HasOrderedClause
214 ? OMPScheduleType::ModifierOrdered
215 : OMPScheduleType::ModifierUnordered;
216 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
217
218 // Unsupported combinations
219 if (OrderingScheduleType ==
220 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
221 return OMPScheduleType::OrderedGuidedChunked;
222 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
223 OMPScheduleType::ModifierOrdered))
224 return OMPScheduleType::OrderedRuntime;
225
226 return OrderingScheduleType;
227}
228
229/// Adds monotonicity modifier flags to schedule type.
230static OMPScheduleType
232 bool HasSimdModifier, bool HasMonotonic,
233 bool HasNonmonotonic, bool HasOrderedClause) {
234 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
235 OMPScheduleType::None &&
236 "Must not have monotonicity flags already set");
237 assert((!HasMonotonic || !HasNonmonotonic) &&
238 "Monotonic and Nonmonotonic are contradicting each other");
239
240 if (HasMonotonic) {
241 return ScheduleType | OMPScheduleType::ModifierMonotonic;
242 } else if (HasNonmonotonic) {
243 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
244 } else {
245 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
246 // If the static schedule kind is specified or if the ordered clause is
247 // specified, and if the nonmonotonic modifier is not specified, the
248 // effect is as if the monotonic modifier is specified. Otherwise, unless
249 // the monotonic modifier is specified, the effect is as if the
250 // nonmonotonic modifier is specified.
251 OMPScheduleType BaseScheduleType =
252 ScheduleType & ~OMPScheduleType::ModifierMask;
253 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
254 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
255 HasOrderedClause) {
256 // The monotonic is used by default in openmp runtime library, so no need
257 // to set it.
258 return ScheduleType;
259 } else {
260 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
261 }
262 }
263}
264
265/// Determine the schedule type using schedule and ordering clause arguments.
266static OMPScheduleType
267computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
268 bool HasSimdModifier, bool HasMonotonicModifier,
269 bool HasNonmonotonicModifier, bool HasOrderedClause) {
270 OMPScheduleType BaseSchedule =
271 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
272 OMPScheduleType OrderedSchedule =
273 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
275 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
276 HasNonmonotonicModifier, HasOrderedClause);
277
279 return Result;
280}
281
282/// Make \p Source branch to \p Target.
283///
284/// Handles two situations:
285/// * \p Source already has an unconditional branch.
286/// * \p Source is a degenerate block (no terminator because the BB is
287/// the current head of the IR construction).
289 if (Instruction *Term = Source->getTerminator()) {
290 auto *Br = cast<BranchInst>(Term);
291 assert(!Br->isConditional() &&
292 "BB's terminator must be an unconditional branch (or degenerate)");
293 BasicBlock *Succ = Br->getSuccessor(0);
294 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
295 Br->setSuccessor(0, Target);
296 return;
297 }
298
299 auto *NewBr = BranchInst::Create(Target, Source);
300 NewBr->setDebugLoc(DL);
301}
302
304 bool CreateBranch, DebugLoc DL) {
305 assert(New->getFirstInsertionPt() == New->begin() &&
306 "Target BB must not have PHI nodes");
307
308 // Move instructions to new block.
309 BasicBlock *Old = IP.getBlock();
310 // If the `Old` block is empty then there are no instructions to move. But in
311 // the new debug scheme, it could have trailing debug records which will be
312 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
313 // reasons:
314 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
315 // 2. Even if `New` is not empty, the rationale to move those records to `New`
316 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
317 // assumes that `Old` is optimized out and is going away. This is not the case
318 // here. The `Old` block is still being used e.g. a branch instruction is
319 // added to it later in this function.
320 // So we call `BasicBlock::splice` only when `Old` is not empty.
321 if (!Old->empty())
322 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
323
324 if (CreateBranch) {
325 auto *NewBr = BranchInst::Create(New, Old);
326 NewBr->setDebugLoc(DL);
327 }
328}
329
330void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
332 BasicBlock *Old = Builder.GetInsertBlock();
333
334 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
335 if (CreateBranch)
336 Builder.SetInsertPoint(Old->getTerminator());
337 else
338 Builder.SetInsertPoint(Old);
339
340 // SetInsertPoint also updates the Builder's debug location, but we want to
341 // keep the one the Builder was configured to use.
343}
344
347 BasicBlock *Old = IP.getBlock();
349 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
350 Old->getParent(), Old->getNextNode());
351 spliceBB(IP, New, CreateBranch, DL);
352 New->replaceSuccessorsPhiUsesWith(Old, New);
353 return New;
354}
355
356BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
359 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
360 if (CreateBranch)
361 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
362 else
363 Builder.SetInsertPoint(Builder.GetInsertBlock());
364 // SetInsertPoint also updates the Builder's debug location, but we want to
365 // keep the one the Builder was configured to use.
367 return New;
368}
369
370BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
373 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
374 if (CreateBranch)
375 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
376 else
377 Builder.SetInsertPoint(Builder.GetInsertBlock());
378 // SetInsertPoint also updates the Builder's debug location, but we want to
379 // keep the one the Builder was configured to use.
381 return New;
382}
383
385 llvm::Twine Suffix) {
386 BasicBlock *Old = Builder.GetInsertBlock();
387 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
388}
389
390// This function creates a fake integer value and a fake use for the integer
391// value. It returns the fake value created. This is useful in modeling the
392// extra arguments to the outlined functions.
394 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
396 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
397 const Twine &Name = "", bool AsPtr = true) {
398 Builder.restoreIP(OuterAllocaIP);
399 Instruction *FakeVal;
400 AllocaInst *FakeValAddr =
401 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
402 ToBeDeleted.push_back(FakeValAddr);
403
404 if (AsPtr) {
405 FakeVal = FakeValAddr;
406 } else {
407 FakeVal =
408 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
409 ToBeDeleted.push_back(FakeVal);
410 }
411
412 // Generate a fake use of this value
413 Builder.restoreIP(InnerAllocaIP);
414 Instruction *UseFakeVal;
415 if (AsPtr) {
416 UseFakeVal =
417 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
418 } else {
419 UseFakeVal =
420 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
421 }
422 ToBeDeleted.push_back(UseFakeVal);
423 return FakeVal;
424}
425
426//===----------------------------------------------------------------------===//
427// OpenMPIRBuilderConfig
428//===----------------------------------------------------------------------===//
429
430namespace {
432/// Values for bit flags for marking which requires clauses have been used.
433enum OpenMPOffloadingRequiresDirFlags {
 /// Flag undefined: no requires-related information recorded yet.
 OMP_REQ_UNDEFINED = 0x000,
 /// No requires directive present.
 OMP_REQ_NONE = 0x001,
 /// reverse_offload clause.
 OMP_REQ_REVERSE_OFFLOAD = 0x002,
 /// unified_address clause.
 OMP_REQ_UNIFIED_ADDRESS = 0x004,
 /// unified_shared_memory clause.
 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
 /// dynamic_allocators clause.
 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
 // Enables LLVM's bitwise operators (|, &, ~) for this enum type; the largest
 // individual flag value must be named here.
 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
447};
448
449} // anonymous namespace
450
452 : RequiresFlags(OMP_REQ_UNDEFINED) {}
453
455 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
456 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
457 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
458 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
459 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
460 RequiresFlags(OMP_REQ_UNDEFINED) {
461 if (HasRequiresReverseOffload)
462 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
463 if (HasRequiresUnifiedAddress)
464 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
465 if (HasRequiresUnifiedSharedMemory)
466 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
467 if (HasRequiresDynamicAllocators)
468 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
469}
470
472 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
473}
474
476 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
477}
478
480 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
481}
482
484 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
485}
486
488 return hasRequiresFlags() ? RequiresFlags
489 : static_cast<int64_t>(OMP_REQ_NONE);
490}
491
493 if (Value)
494 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
495 else
496 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
497}
498
500 if (Value)
501 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
502 else
503 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
504}
505
507 if (Value)
508 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
509 else
510 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
511}
512
514 if (Value)
515 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
516 else
517 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
518}
519
520//===----------------------------------------------------------------------===//
521// OpenMPIRBuilder
522//===----------------------------------------------------------------------===//
523
525 IRBuilderBase &Builder,
526 SmallVector<Value *> &ArgsVector) {
528 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
529 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
530 constexpr const size_t MaxDim = 3;
531 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
532 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
533
534 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
535
536 Value *NumTeams3D =
537 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
538 Value *NumThreads3D =
539 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
540 for (unsigned I :
541 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
542 NumTeams3D =
543 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
544 for (unsigned I :
545 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
546 NumThreads3D =
547 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
548
549 ArgsVector = {Version,
550 PointerNum,
551 KernelArgs.RTArgs.BasePointersArray,
552 KernelArgs.RTArgs.PointersArray,
553 KernelArgs.RTArgs.SizesArray,
554 KernelArgs.RTArgs.MapTypesArray,
555 KernelArgs.RTArgs.MapNamesArray,
556 KernelArgs.RTArgs.MappersArray,
557 KernelArgs.NumIterations,
558 Flags,
559 NumTeams3D,
560 NumThreads3D,
561 KernelArgs.DynCGGroupMem};
562}
563
565 LLVMContext &Ctx = Fn.getContext();
566
567 // Get the function's current attributes.
568 auto Attrs = Fn.getAttributes();
569 auto FnAttrs = Attrs.getFnAttrs();
570 auto RetAttrs = Attrs.getRetAttrs();
572 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
573 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
574
575 // Add AS to FnAS while taking special care with integer extensions.
576 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
577 bool Param = true) -> void {
578 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
579 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
580 if (HasSignExt || HasZeroExt) {
581 assert(AS.getNumAttributes() == 1 &&
582 "Currently not handling extension attr combined with others.");
583 if (Param) {
584 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
585 FnAS = FnAS.addAttribute(Ctx, AK);
586 } else if (auto AK =
587 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
588 FnAS = FnAS.addAttribute(Ctx, AK);
589 } else {
590 FnAS = FnAS.addAttributes(Ctx, AS);
591 }
592 };
593
594#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
595#include "llvm/Frontend/OpenMP/OMPKinds.def"
596
597 // Add attributes to the function declaration.
598 switch (FnID) {
599#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
600 case Enum: \
601 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
602 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
603 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
604 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
605 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
606 break;
607#include "llvm/Frontend/OpenMP/OMPKinds.def"
608 default:
609 // Attributes are optional.
610 break;
611 }
612}
613
616 FunctionType *FnTy = nullptr;
617 Function *Fn = nullptr;
618
619 // Try to find the declation in the module first.
620 switch (FnID) {
621#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
622 case Enum: \
623 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
624 IsVarArg); \
625 Fn = M.getFunction(Str); \
626 break;
627#include "llvm/Frontend/OpenMP/OMPKinds.def"
628 }
629
630 if (!Fn) {
631 // Create a new declaration if we need one.
632 switch (FnID) {
633#define OMP_RTL(Enum, Str, ...) \
634 case Enum: \
635 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
636 break;
637#include "llvm/Frontend/OpenMP/OMPKinds.def"
638 }
639
640 // Add information if the runtime function takes a callback function
641 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
642 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
643 LLVMContext &Ctx = Fn->getContext();
644 MDBuilder MDB(Ctx);
645 // Annotate the callback behavior of the runtime function:
646 // - The callback callee is argument number 2 (microtask).
647 // - The first two arguments of the callback callee are unknown (-1).
648 // - All variadic arguments to the runtime function are passed to the
649 // callback callee.
650 Fn->addMetadata(
651 LLVMContext::MD_callback,
653 2, {-1, -1}, /* VarArgsArePassed */ true)}));
654 }
655 }
656
657 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
658 << " with type " << *Fn->getFunctionType() << "\n");
659 addAttributes(FnID, *Fn);
660
661 } else {
662 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
663 << " with type " << *Fn->getFunctionType() << "\n");
664 }
665
666 assert(Fn && "Failed to create OpenMP runtime function");
667
668 return {FnTy, Fn};
669}
670
673 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
674 assert(Fn && "Failed to create OpenMP runtime function pointer");
675 return Fn;
676}
677
/// One-time setup of the builder: delegates to initializeTypes to populate the
/// cached LLVM types for module M (no other state is touched here).
678void OpenMPIRBuilder::initialize() { initializeTypes(M); }
679
682 BasicBlock &EntryBlock = Function->getEntryBlock();
683 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
684
685 // Loop over blocks looking for constant allocas, skipping the entry block
686 // as any allocas there are already in the desired location.
687 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
688 Block++) {
689 for (auto Inst = Block->getReverseIterator()->begin();
690 Inst != Block->getReverseIterator()->end();) {
691 if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
692 Inst++;
693 if (!isa<ConstantData>(AllocaInst->getArraySize()))
694 continue;
695 AllocaInst->moveBeforePreserving(MoveLocInst);
696 } else {
697 Inst++;
698 }
699 }
700 }
701}
702
704 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
706 SmallVector<OutlineInfo, 16> DeferredOutlines;
707 for (OutlineInfo &OI : OutlineInfos) {
708 // Skip functions that have not finalized yet; may happen with nested
709 // function generation.
710 if (Fn && OI.getFunction() != Fn) {
711 DeferredOutlines.push_back(OI);
712 continue;
713 }
714
715 ParallelRegionBlockSet.clear();
716 Blocks.clear();
717 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
718
719 Function *OuterFn = OI.getFunction();
720 CodeExtractorAnalysisCache CEAC(*OuterFn);
721 // If we generate code for the target device, we need to allocate
722 // struct for aggregate params in the device default alloca address space.
723 // OpenMP runtime requires that the params of the extracted functions are
724 // passed as zero address space pointers. This flag ensures that
725 // CodeExtractor generates correct code for extracted functions
726 // which are used by OpenMP runtime.
727 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
728 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
729 /* AggregateArgs */ true,
730 /* BlockFrequencyInfo */ nullptr,
731 /* BranchProbabilityInfo */ nullptr,
732 /* AssumptionCache */ nullptr,
733 /* AllowVarArgs */ true,
734 /* AllowAlloca */ true,
735 /* AllocaBlock*/ OI.OuterAllocaBB,
736 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
737
738 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
739 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
740 << " Exit: " << OI.ExitBB->getName() << "\n");
741 assert(Extractor.isEligible() &&
742 "Expected OpenMP outlining to be possible!");
743
744 for (auto *V : OI.ExcludeArgsFromAggregate)
745 Extractor.excludeArgFromAggregate(V);
746
747 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
748
749 // Forward target-cpu, target-features attributes to the outlined function.
750 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
751 if (TargetCpuAttr.isStringAttribute())
752 OutlinedFn->addFnAttr(TargetCpuAttr);
753
754 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
755 if (TargetFeaturesAttr.isStringAttribute())
756 OutlinedFn->addFnAttr(TargetFeaturesAttr);
757
758 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
759 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
760 assert(OutlinedFn->getReturnType()->isVoidTy() &&
761 "OpenMP outlined functions should not return a value!");
762
763 // For compability with the clang CG we move the outlined function after the
764 // one with the parallel region.
765 OutlinedFn->removeFromParent();
766 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
767
768 // Remove the artificial entry introduced by the extractor right away, we
769 // made our own entry block after all.
770 {
771 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
772 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
773 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
774 // Move instructions from the to-be-deleted ArtificialEntry to the entry
775 // basic block of the parallel region. CodeExtractor generates
776 // instructions to unwrap the aggregate argument and may sink
777 // allocas/bitcasts for values that are solely used in the outlined region
778 // and do not escape.
779 assert(!ArtificialEntry.empty() &&
780 "Expected instructions to add in the outlined region entry");
781 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
782 End = ArtificialEntry.rend();
783 It != End;) {
784 Instruction &I = *It;
785 It++;
786
787 if (I.isTerminator()) {
788 // Absorb any debug value that terminator may have
789 if (OI.EntryBB->getTerminator())
790 OI.EntryBB->getTerminator()->adoptDbgRecords(
791 &ArtificialEntry, I.getIterator(), false);
792 continue;
793 }
794
795 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
796 }
797
798 OI.EntryBB->moveBefore(&ArtificialEntry);
799 ArtificialEntry.eraseFromParent();
800 }
801 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
802 assert(OutlinedFn && OutlinedFn->hasNUses(1));
803
804 // Run a user callback, e.g. to add attributes.
805 if (OI.PostOutlineCB)
806 OI.PostOutlineCB(*OutlinedFn);
807 }
808
809 // Remove work items that have been completed.
810 OutlineInfos = std::move(DeferredOutlines);
811
812 // The createTarget functions embeds user written code into
813 // the target region which may inject allocas which need to
814 // be moved to the entry block of our target or risk malformed
815 // optimisations by later passes, this is only relevant for
816 // the device pass which appears to be a little more delicate
817 // when it comes to optimisations (however, we do not block on
818 // that here, it's up to the inserter to the list to do so).
819 // This notbaly has to occur after the OutlinedInfo candidates
820 // have been extracted so we have an end product that will not
821 // be implicitly adversely affected by any raises unless
822 // intentionally appended to the list.
823 // NOTE: This only does so for ConstantData, it could be extended
824 // to ConstantExpr's with further effort, however, they should
825 // largely be folded when they get here. Extending it to runtime
826 // defined/read+writeable allocation sizes would be non-trivial
827 // (need to factor in movement of any stores to variables the
828 // allocation size depends on, as well as the usual loads,
829 // otherwise it'll yield the wrong result after movement) and
830 // likely be more suitable as an LLVM optimisation pass.
833
834 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
835 [](EmitMetadataErrorKind Kind,
836 const TargetRegionEntryInfo &EntryInfo) -> void {
837 errs() << "Error of kind: " << Kind
838 << " when emitting offload entries and metadata during "
839 "OMPIRBuilder finalization \n";
840 };
841
844
845 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
846 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
847 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
848 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
849 }
850
851 IsFinalized = true;
852}
853
/// Returns true once finalize() has completed (IsFinalized is set to true at
/// the end of finalize()).
854bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
855
857 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
858}
859
862 auto *GV =
863 new GlobalVariable(M, I32Ty,
864 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
865 ConstantInt::get(I32Ty, Value), Name);
866 GV->setVisibility(GlobalValue::HiddenVisibility);
867
868 return GV;
869}
870
872 if (List.empty())
873 return;
874
875 // Convert List to what ConstantArray needs.
877 UsedArray.resize(List.size());
878 for (unsigned I = 0, E = List.size(); I != E; ++I)
880 cast<Constant>(&*List[I]), Builder.getPtrTy());
881
882 if (UsedArray.empty())
883 return;
884 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
885
886 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
887 ConstantArray::get(ATy, UsedArray), Name);
888
889 GV->setSection("llvm.metadata");
890}
891
894 OMPTgtExecModeFlags Mode) {
895 auto *Int8Ty = Builder.getInt8Ty();
896 auto *GVMode = new GlobalVariable(
897 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
898 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
899 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
900 return GVMode;
901}
902
904 uint32_t SrcLocStrSize,
905 IdentFlag LocFlags,
906 unsigned Reserve2Flags) {
907 // Enable "C-mode".
908 LocFlags |= OMP_IDENT_FLAG_KMPC;
909
910 Constant *&Ident =
911 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
912 if (!Ident) {
914 Constant *IdentData[] = {I32Null,
915 ConstantInt::get(Int32, uint32_t(LocFlags)),
916 ConstantInt::get(Int32, Reserve2Flags),
917 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
918 Constant *Initializer =
919 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
920
921 // Look for existing encoding of the location + flags, not needed but
922 // minimizes the difference to the existing solution while we transition.
923 for (GlobalVariable &GV : M.globals())
924 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
925 if (GV.getInitializer() == Initializer)
926 Ident = &GV;
927
928 if (!Ident) {
929 auto *GV = new GlobalVariable(
930 M, OpenMPIRBuilder::Ident,
931 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
934 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
935 GV->setAlignment(Align(8));
936 Ident = GV;
937 }
938 }
939
941}
942
944 uint32_t &SrcLocStrSize) {
945 SrcLocStrSize = LocStr.size();
946 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
947 if (!SrcLocStr) {
948 Constant *Initializer =
950
951 // Look for existing encoding of the location, not needed but minimizes the
952 // difference to the existing solution while we transition.
953 for (GlobalVariable &GV : M.globals())
954 if (GV.isConstant() && GV.hasInitializer() &&
955 GV.getInitializer() == Initializer)
956 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
957
958 SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
959 /* AddressSpace */ 0, &M);
960 }
961 return SrcLocStr;
962}
963
965 StringRef FileName,
966 unsigned Line, unsigned Column,
967 uint32_t &SrcLocStrSize) {
968 SmallString<128> Buffer;
969 Buffer.push_back(';');
970 Buffer.append(FileName);
971 Buffer.push_back(';');
972 Buffer.append(FunctionName);
973 Buffer.push_back(';');
974 Buffer.append(std::to_string(Line));
975 Buffer.push_back(';');
976 Buffer.append(std::to_string(Column));
977 Buffer.push_back(';');
978 Buffer.push_back(';');
979 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
980}
981
982Constant *
984 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
985 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
986}
987
989 uint32_t &SrcLocStrSize,
990 Function *F) {
991 DILocation *DIL = DL.get();
992 if (!DIL)
993 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
994 StringRef FileName = M.getName();
995 if (DIFile *DIF = DIL->getFile())
996 if (std::optional<StringRef> Source = DIF->getSource())
997 FileName = *Source;
998 StringRef Function = DIL->getScope()->getSubprogram()->getName();
999 if (Function.empty() && F)
1000 Function = F->getName();
1001 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1002 DIL->getColumn(), SrcLocStrSize);
1003}
1004
1006 uint32_t &SrcLocStrSize) {
1007 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1008 Loc.IP.getBlock()->getParent());
1009}
1010
1012 return Builder.CreateCall(
1013 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1014 "omp_global_thread_num");
1015}
1016
1019 bool ForceSimpleCall, bool CheckCancelFlag) {
1020 if (!updateToLocation(Loc))
1021 return Loc.IP;
1022
1023 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1024 // __kmpc_barrier(loc, thread_id);
1025
1026 IdentFlag BarrierLocFlags;
1027 switch (Kind) {
1028 case OMPD_for:
1029 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1030 break;
1031 case OMPD_sections:
1032 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1033 break;
1034 case OMPD_single:
1035 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1036 break;
1037 case OMPD_barrier:
1038 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1039 break;
1040 default:
1041 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1042 break;
1043 }
1044
1045 uint32_t SrcLocStrSize;
1046 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1047 Value *Args[] = {
1048 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1049 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1050
1051 // If we are in a cancellable parallel region, barriers are cancellation
1052 // points.
1053 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1054 bool UseCancelBarrier =
1055 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1056
1057 Value *Result =
1059 UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
1060 : OMPRTL___kmpc_barrier),
1061 Args);
1062
1063 if (UseCancelBarrier && CheckCancelFlag)
1064 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1065 return Err;
1066
1067 return Builder.saveIP();
1068}
1069
1072 Value *IfCondition,
1073 omp::Directive CanceledDirective) {
1074 if (!updateToLocation(Loc))
1075 return Loc.IP;
1076
1077 // LLVM utilities like blocks with terminators.
1078 auto *UI = Builder.CreateUnreachable();
1079
1080 Instruction *ThenTI = UI, *ElseTI = nullptr;
1081 if (IfCondition)
1082 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1083 Builder.SetInsertPoint(ThenTI);
1084
1085 Value *CancelKind = nullptr;
1086 switch (CanceledDirective) {
1087#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1088 case DirectiveEnum: \
1089 CancelKind = Builder.getInt32(Value); \
1090 break;
1091#include "llvm/Frontend/OpenMP/OMPKinds.def"
1092 default:
1093 llvm_unreachable("Unknown cancel kind!");
1094 }
1095
1096 uint32_t SrcLocStrSize;
1097 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1098 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1099 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1100 Value *Result = Builder.CreateCall(
1101 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1102 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
1103 if (CanceledDirective == OMPD_parallel) {
1105 Builder.restoreIP(IP);
1107 omp::Directive::OMPD_unknown,
1108 /* ForceSimpleCall */ false,
1109 /* CheckCancelFlag */ false)
1110 .takeError();
1111 }
1112 return Error::success();
1113 };
1114
1115 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1116 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
1117 return Err;
1118
1119 // Update the insertion point and remove the terminator we introduced.
1120 Builder.SetInsertPoint(UI->getParent());
1121 UI->eraseFromParent();
1122
1123 return Builder.saveIP();
1124}
1125
1128 omp::Directive CanceledDirective) {
1129 if (!updateToLocation(Loc))
1130 return Loc.IP;
1131
1132 // LLVM utilities like blocks with terminators.
1133 auto *UI = Builder.CreateUnreachable();
1135
1136 Value *CancelKind = nullptr;
1137 switch (CanceledDirective) {
1138#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1139 case DirectiveEnum: \
1140 CancelKind = Builder.getInt32(Value); \
1141 break;
1142#include "llvm/Frontend/OpenMP/OMPKinds.def"
1143 default:
1144 llvm_unreachable("Unknown cancel kind!");
1145 }
1146
1147 uint32_t SrcLocStrSize;
1148 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1149 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1150 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1151 Value *Result = Builder.CreateCall(
1152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1153 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
1154 if (CanceledDirective == OMPD_parallel) {
1156 Builder.restoreIP(IP);
1158 omp::Directive::OMPD_unknown,
1159 /* ForceSimpleCall */ false,
1160 /* CheckCancelFlag */ false)
1161 .takeError();
1162 }
1163 return Error::success();
1164 };
1165
1166 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1167 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
1168 return Err;
1169
1170 // Update the insertion point and remove the terminator we introduced.
1171 Builder.SetInsertPoint(UI->getParent());
1172 UI->eraseFromParent();
1173
1174 return Builder.saveIP();
1175}
1176
1178 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1179 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1180 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1181 if (!updateToLocation(Loc))
1182 return Loc.IP;
1183
1184 Builder.restoreIP(AllocaIP);
1185 auto *KernelArgsPtr =
1186 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1187 updateToLocation(Loc);
1188
1189 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1190 llvm::Value *Arg =
1191 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1193 KernelArgs[I], Arg,
1194 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1195 }
1196
1197 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1198 NumThreads, HostPtr, KernelArgsPtr};
1199
1200 Return = Builder.CreateCall(
1201 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1202 OffloadingArgs);
1203
1204 return Builder.saveIP();
1205}
1206
1208 const LocationDescription &Loc, Value *OutlinedFnID,
1209 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1210 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1211
1212 if (!updateToLocation(Loc))
1213 return Loc.IP;
1214
1215 // On top of the arrays that were filled up, the target offloading call
1216 // takes as arguments the device id as well as the host pointer. The host
1217 // pointer is used by the runtime library to identify the current target
1218 // region, so it only has to be unique and not necessarily point to
1219 // anything. It could be the pointer to the outlined function that
1220 // implements the target region, but we aren't using that so that the
1221 // compiler doesn't need to keep that, and could therefore inline the host
1222 // function if proven worthwhile during optimization.
1223
1224 // From this point on, we need to have an ID of the target region defined.
1225 assert(OutlinedFnID && "Invalid outlined function ID!");
1226 (void)OutlinedFnID;
1227
1228 // Return value of the runtime offloading call.
1229 Value *Return = nullptr;
1230
1231 // Arguments for the target kernel.
1232 SmallVector<Value *> ArgsVector;
1233 getKernelArgsVector(Args, Builder, ArgsVector);
1234
1235 // The target region is an outlined function launched by the runtime
1236 // via calls to __tgt_target_kernel().
1237 //
1238 // Note that on the host and CPU targets, the runtime implementation of
1239 // these calls simply call the outlined function without forking threads.
1240 // The outlined functions themselves have runtime calls to
1241 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1242 // the compiler in emitTeamsCall() and emitParallelCall().
1243 //
1244 // In contrast, on the NVPTX target, the implementation of
1245 // __tgt_target_teams() launches a GPU kernel with the requested number
1246 // of teams and threads so no additional calls to the runtime are required.
1247 // Check the error code and execute the host version if required.
1249 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1250 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1251
1252 BasicBlock *OffloadFailedBlock =
1253 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1254 BasicBlock *OffloadContBlock =
1255 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1257 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1258
1259 auto CurFn = Builder.GetInsertBlock()->getParent();
1260 emitBlock(OffloadFailedBlock, CurFn);
1261 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1262 if (!AfterIP)
1263 return AfterIP.takeError();
1264 Builder.restoreIP(*AfterIP);
1265 emitBranch(OffloadContBlock);
1266 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1267 return Builder.saveIP();
1268}
1269
1271 Value *CancelFlag, omp::Directive CanceledDirective,
1272 FinalizeCallbackTy ExitCB) {
1273 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1274 "Unexpected cancellation!");
1275
1276 // For a cancel barrier we create two new blocks.
1278 BasicBlock *NonCancellationBlock;
1279 if (Builder.GetInsertPoint() == BB->end()) {
1280 // TODO: This branch will not be needed once we moved to the
1281 // OpenMPIRBuilder codegen completely.
1282 NonCancellationBlock = BasicBlock::Create(
1283 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1284 } else {
1285 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1288 }
1289 BasicBlock *CancellationBlock = BasicBlock::Create(
1290 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1291
1292 // Jump to them based on the return value.
1293 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1294 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1295 /* TODO weight */ nullptr, nullptr);
1296
1297 // From the cancellation block we finalize all variables and go to the
1298 // post finalization block that is known to the FiniCB callback.
1299 Builder.SetInsertPoint(CancellationBlock);
1300 if (ExitCB)
1301 if (Error Err = ExitCB(Builder.saveIP()))
1302 return Err;
1303 auto &FI = FinalizationStack.back();
1304 if (Error Err = FI.FiniCB(Builder.saveIP()))
1305 return Err;
1306
1307 // The continuation block is where code generation continues.
1308 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1309 return Error::success();
1310}
1311
1312// Callback used to create OpenMP runtime calls to support
1313// the omp parallel clause for the device.
1314// We need to use this callback to replace the call to OutlinedFn in OuterFn
1315// with a call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
1317 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1318 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1319 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1320 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1321 // Add some known attributes.
1322 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1323 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1324 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1325 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1326 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1327 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1328
1329 assert(OutlinedFn.arg_size() >= 2 &&
1330 "Expected at least tid and bounded tid as arguments");
1331 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1332
1333 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1334 assert(CI && "Expected call instruction to outlined function");
1335 CI->getParent()->setName("omp_parallel");
1336
1337 Builder.SetInsertPoint(CI);
1338 Type *PtrTy = OMPIRBuilder->VoidPtr;
1339 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1340
1341 // Add alloca for kernel args
1342 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1343 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1344 AllocaInst *ArgsAlloca =
1345 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1346 Value *Args = ArgsAlloca;
1347 // Add address space cast if array for storing arguments is not allocated
1348 // in address space 0
1349 if (ArgsAlloca->getAddressSpace())
1350 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1351 Builder.restoreIP(CurrentIP);
1352
1353 // Store captured vars which are used by kmpc_parallel_51
1354 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1355 Value *V = *(CI->arg_begin() + 2 + Idx);
1356 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1357 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1358 Builder.CreateStore(V, StoreAddress);
1359 }
1360
1361 Value *Cond =
1362 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1363 : Builder.getInt32(1);
1364
1365 // Build kmpc_parallel_51 call
1366 Value *Parallel51CallArgs[] = {
1367 /* identifier*/ Ident,
1368 /* global thread num*/ ThreadID,
1369 /* if expression */ Cond,
1370 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1371 /* Proc bind */ Builder.getInt32(-1),
1372 /* outlined function */ &OutlinedFn,
1373 /* wrapper function */ NullPtrValue,
1374 /* arguments of the outlined funciton*/ Args,
1375 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
1376
1377 FunctionCallee RTLFn =
1378 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
1379
1380 Builder.CreateCall(RTLFn, Parallel51CallArgs);
1381
1382 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
1383 << *Builder.GetInsertBlock()->getParent() << "\n");
1384
1385 // Initialize the local TID stack location with the argument value.
1386 Builder.SetInsertPoint(PrivTID);
1387 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1388 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1389 PrivTIDAddr);
1390
1391 // Remove redundant call to the outlined function.
1392 CI->eraseFromParent();
1393
1394 for (Instruction *I : ToBeDeleted) {
1395 I->eraseFromParent();
1396 }
1397}
1398
1399// Callback used to create OpenMP runtime calls to support
1400// the omp parallel clause for the host.
1401// We need to use this callback to replace the call to OutlinedFn in OuterFn
1402// with a call to the OpenMP host runtime function (__kmpc_fork_call[_if])
1403static void
1405 Function *OuterFn, Value *Ident, Value *IfCondition,
1406 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1407 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1408 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1409 FunctionCallee RTLFn;
1410 if (IfCondition) {
1411 RTLFn =
1412 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1413 } else {
1414 RTLFn =
1415 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1416 }
1417 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1418 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1419 LLVMContext &Ctx = F->getContext();
1420 MDBuilder MDB(Ctx);
1421 // Annotate the callback behavior of the __kmpc_fork_call:
1422 // - The callback callee is argument number 2 (microtask).
1423 // - The first two arguments of the callback callee are unknown (-1).
1424 // - All variadic arguments to the __kmpc_fork_call are passed to the
1425 // callback callee.
1426 F->addMetadata(LLVMContext::MD_callback,
1428 2, {-1, -1},
1429 /* VarArgsArePassed */ true)}));
1430 }
1431 }
1432 // Add some known attributes.
1433 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1434 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1435 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1436
1437 assert(OutlinedFn.arg_size() >= 2 &&
1438 "Expected at least tid and bounded tid as arguments");
1439 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1440
1441 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1442 CI->getParent()->setName("omp_parallel");
1443 Builder.SetInsertPoint(CI);
1444
1445 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1446 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1447 &OutlinedFn};
1448
1449 SmallVector<Value *, 16> RealArgs;
1450 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1451 if (IfCondition) {
1452 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1453 RealArgs.push_back(Cond);
1454 }
1455 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1456
1457 // __kmpc_fork_call_if always expects a void ptr as the last argument
1458 // If there are no arguments, pass a null pointer.
1459 auto PtrTy = OMPIRBuilder->VoidPtr;
1460 if (IfCondition && NumCapturedVars == 0) {
1461 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1462 RealArgs.push_back(NullPtrValue);
1463 }
1464
1465 Builder.CreateCall(RTLFn, RealArgs);
1466
1467 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1468 << *Builder.GetInsertBlock()->getParent() << "\n");
1469
1470 // Initialize the local TID stack location with the argument value.
1471 Builder.SetInsertPoint(PrivTID);
1472 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1473 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1474 PrivTIDAddr);
1475
1476 // Remove redundant call to the outlined function.
1477 CI->eraseFromParent();
1478
1479 for (Instruction *I : ToBeDeleted) {
1480 I->eraseFromParent();
1481 }
1482}
1483
1485 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1486 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1487 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1488 omp::ProcBindKind ProcBind, bool IsCancellable) {
1489 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1490
1491 if (!updateToLocation(Loc))
1492 return Loc.IP;
1493
1494 uint32_t SrcLocStrSize;
1495 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1496 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1497 Value *ThreadID = getOrCreateThreadID(Ident);
1498 // If we generate code for the target device, we need to allocate
1499 // struct for aggregate params in the device default alloca address space.
1500 // OpenMP runtime requires that the params of the extracted functions are
1501 // passed as zero address space pointers. This flag ensures that extracted
1502 // function arguments are declared in zero address space
1503 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1504
1505 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1506 // only if we compile for host side.
1507 if (NumThreads && !Config.isTargetDevice()) {
1508 Value *Args[] = {
1509 Ident, ThreadID,
1510 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1512 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1513 }
1514
1515 if (ProcBind != OMP_PROC_BIND_default) {
1516 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1517 Value *Args[] = {
1518 Ident, ThreadID,
1519 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1521 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1522 }
1523
1524 BasicBlock *InsertBB = Builder.GetInsertBlock();
1525 Function *OuterFn = InsertBB->getParent();
1526
1527 // Save the outer alloca block because the insertion iterator may get
1528 // invalidated and we still need this later.
1529 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1530
1531 // Vector to remember instructions we used only during the modeling but which
1532 // we want to delete at the end.
1534
1535 // Change the location to the outer alloca insertion point to create and
1536 // initialize the allocas we pass into the parallel region.
1537 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1538 Builder.restoreIP(NewOuter);
1539 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1540 AllocaInst *ZeroAddrAlloca =
1541 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1542 Instruction *TIDAddr = TIDAddrAlloca;
1543 Instruction *ZeroAddr = ZeroAddrAlloca;
1544 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1545 // Add additional casts to enforce pointers in zero address space
1546 TIDAddr = new AddrSpaceCastInst(
1547 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1548 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1549 ToBeDeleted.push_back(TIDAddr);
1550 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1551 PointerType ::get(M.getContext(), 0),
1552 "zero.addr.ascast");
1553 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1554 ToBeDeleted.push_back(ZeroAddr);
1555 }
1556
1557 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1558 // associated arguments in the outlined function, so we delete them later.
1559 ToBeDeleted.push_back(TIDAddrAlloca);
1560 ToBeDeleted.push_back(ZeroAddrAlloca);
1561
1562 // Create an artificial insertion point that will also ensure the blocks we
1563 // are about to split are not degenerated.
1564 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1565
1566 BasicBlock *EntryBB = UI->getParent();
1567 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1568 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1569 BasicBlock *PRegPreFiniBB =
1570 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1571 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1572
1573 auto FiniCBWrapper = [&](InsertPointTy IP) {
1574 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1575 // target to the region exit block.
1576 if (IP.getBlock()->end() == IP.getPoint()) {
1578 Builder.restoreIP(IP);
1579 Instruction *I = Builder.CreateBr(PRegExitBB);
1580 IP = InsertPointTy(I->getParent(), I->getIterator());
1581 }
1582 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1583 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1584 "Unexpected insertion point for finalization call!");
1585 return FiniCB(IP);
1586 };
1587
1588 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1589
1590 // Generate the privatization allocas in the block that will become the entry
1591 // of the outlined function.
1592 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1593 InsertPointTy InnerAllocaIP = Builder.saveIP();
1594
1595 AllocaInst *PrivTIDAddr =
1596 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1597 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1598
1599 // Add some fake uses for OpenMP provided arguments.
1600 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1601 Instruction *ZeroAddrUse =
1602 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1603 ToBeDeleted.push_back(ZeroAddrUse);
1604
1605 // EntryBB
1606 // |
1607 // V
1608 // PRegionEntryBB <- Privatization allocas are placed here.
1609 // |
1610 // V
1611 // PRegionBodyBB <- BodeGen is invoked here.
1612 // |
1613 // V
1614 // PRegPreFiniBB <- The block we will start finalization from.
1615 // |
1616 // V
1617 // PRegionExitBB <- A common exit to simplify block collection.
1618 //
1619
1620 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1621
1622 // Let the caller create the body.
1623 assert(BodyGenCB && "Expected body generation callback!");
1624 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1625 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1626 return Err;
1627
1628 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1629
1630 OutlineInfo OI;
1631 if (Config.isTargetDevice()) {
1632 // Generate OpenMP target specific runtime call
1633 OI.PostOutlineCB = [=, ToBeDeletedVec =
1634 std::move(ToBeDeleted)](Function &OutlinedFn) {
1635 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1636 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1637 ThreadID, ToBeDeletedVec);
1638 };
1639 } else {
1640 // Generate OpenMP host runtime call
1641 OI.PostOutlineCB = [=, ToBeDeletedVec =
1642 std::move(ToBeDeleted)](Function &OutlinedFn) {
1643 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1644 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1645 };
1646 }
1647
1648 OI.OuterAllocaBB = OuterAllocaBlock;
1649 OI.EntryBB = PRegEntryBB;
1650 OI.ExitBB = PRegExitBB;
1651
1652 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1654 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1655
1656 CodeExtractorAnalysisCache CEAC(*OuterFn);
1657 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1658 /* AggregateArgs */ false,
1659 /* BlockFrequencyInfo */ nullptr,
1660 /* BranchProbabilityInfo */ nullptr,
1661 /* AssumptionCache */ nullptr,
1662 /* AllowVarArgs */ true,
1663 /* AllowAlloca */ true,
1664 /* AllocationBlock */ OuterAllocaBlock,
1665 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1666
1667 // Find inputs to, outputs from the code region.
1668 BasicBlock *CommonExit = nullptr;
1669 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1670 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1671
1672 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1673 /*CollectGlobalInputs=*/true);
1674
1675 Inputs.remove_if([&](Value *I) {
1676 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1677 return GV->getValueType() == OpenMPIRBuilder::Ident;
1678
1679 return false;
1680 });
1681
1682 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1683
1684 FunctionCallee TIDRTLFn =
1685 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1686
1687 auto PrivHelper = [&](Value &V) -> Error {
1688 if (&V == TIDAddr || &V == ZeroAddr) {
1689 OI.ExcludeArgsFromAggregate.push_back(&V);
1690 return Error::success();
1691 }
1692
1694 for (Use &U : V.uses())
1695 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1696 if (ParallelRegionBlockSet.count(UserI->getParent()))
1697 Uses.insert(&U);
1698
1699 // __kmpc_fork_call expects extra arguments as pointers. If the input
1700 // already has a pointer type, everything is fine. Otherwise, store the
1701 // value onto stack and load it back inside the to-be-outlined region. This
1702 // will ensure only the pointer will be passed to the function.
1703 // FIXME: if there are more than 15 trailing arguments, they must be
1704 // additionally packed in a struct.
1705 Value *Inner = &V;
1706 if (!V.getType()->isPointerTy()) {
1708 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1709
1710 Builder.restoreIP(OuterAllocaIP);
1711 Value *Ptr =
1712 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1713
1714 // Store to stack at end of the block that currently branches to the entry
1715 // block of the to-be-outlined region.
1716 Builder.SetInsertPoint(InsertBB,
1717 InsertBB->getTerminator()->getIterator());
1718 Builder.CreateStore(&V, Ptr);
1719
1720 // Load back next to allocations in the to-be-outlined region.
1721 Builder.restoreIP(InnerAllocaIP);
1722 Inner = Builder.CreateLoad(V.getType(), Ptr);
1723 }
1724
1725 Value *ReplacementValue = nullptr;
1726 CallInst *CI = dyn_cast<CallInst>(&V);
1727 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1728 ReplacementValue = PrivTID;
1729 } else {
1730 InsertPointOrErrorTy AfterIP =
1731 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1732 if (!AfterIP)
1733 return AfterIP.takeError();
1734 Builder.restoreIP(*AfterIP);
1735 InnerAllocaIP = {
1736 InnerAllocaIP.getBlock(),
1737 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1738
1739 assert(ReplacementValue &&
1740 "Expected copy/create callback to set replacement value!");
1741 if (ReplacementValue == &V)
1742 return Error::success();
1743 }
1744
1745 for (Use *UPtr : Uses)
1746 UPtr->set(ReplacementValue);
1747
1748 return Error::success();
1749 };
1750
1751 // Reset the inner alloca insertion as it will be used for loading the values
1752 // wrapped into pointers before passing them into the to-be-outlined region.
1753 // Configure it to insert immediately after the fake use of zero address so
1754 // that they are available in the generated body and so that the
1755 // OpenMP-related values (thread ID and zero address pointers) remain leading
1756 // in the argument list.
1757 InnerAllocaIP = IRBuilder<>::InsertPoint(
1758 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1759
1760 // Reset the outer alloca insertion point to the entry of the relevant block
1761 // in case it was invalidated.
1762 OuterAllocaIP = IRBuilder<>::InsertPoint(
1763 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1764
1765 for (Value *Input : Inputs) {
1766 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1767 if (Error Err = PrivHelper(*Input))
1768 return Err;
1769 }
1770 LLVM_DEBUG({
1771 for (Value *Output : Outputs)
1772 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1773 });
1774 assert(Outputs.empty() &&
1775 "OpenMP outlining should not produce live-out values!");
1776
1777 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1778 LLVM_DEBUG({
1779 for (auto *BB : Blocks)
1780 dbgs() << " PBR: " << BB->getName() << "\n";
1781 });
1782
1783 // Adjust the finalization stack, verify the adjustment, and call the
1784 // finalize function a last time to finalize values between the pre-fini
1785 // block and the exit block if we left the parallel "the normal way".
1786 auto FiniInfo = FinalizationStack.pop_back_val();
1787 (void)FiniInfo;
1788 assert(FiniInfo.DK == OMPD_parallel &&
1789 "Unexpected finalization stack state!");
1790
1791 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1792
1793 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1794 if (Error Err = FiniCB(PreFiniIP))
1795 return Err;
1796
1797 // Register the outlined info.
1798 addOutlineInfo(std::move(OI));
1799
1800 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1801 UI->eraseFromParent();
1802
1803 return AfterIP;
1804}
1805
1807 // Build call void __kmpc_flush(ident_t *loc)
1808 uint32_t SrcLocStrSize;
1809 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1810 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1811
1812 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1813}
1814
1816 if (!updateToLocation(Loc))
1817 return;
1818 emitFlush(Loc);
1819}
1820
1822 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1823 // global_tid);
1824 uint32_t SrcLocStrSize;
1825 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1826 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1827 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1828
1829 // Ignore return result until untied tasks are supported.
1830 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1831 Args);
1832}
1833
1835 if (!updateToLocation(Loc))
1836 return;
1837 emitTaskwaitImpl(Loc);
1838}
1839
1841 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1842 uint32_t SrcLocStrSize;
1843 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1844 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1846 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1847
1848 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1849 Args);
1850}
1851
1853 if (!updateToLocation(Loc))
1854 return;
1855 emitTaskyieldImpl(Loc);
1856}
1857
1858// Processes the dependencies in Dependencies and does the following
1859// - Allocates space on the stack of an array of DependInfo objects
1860// - Populates each DependInfo object with relevant information of
1861// the corresponding dependence.
1862// - All code is inserted in the entry block of the current function.
1864 OpenMPIRBuilder &OMPBuilder,
1866 // Early return if we have no dependencies to process
1867 if (Dependencies.empty())
1868 return nullptr;
1869
1870 // Given a vector of DependData objects, in this function we create an
1871 // array on the stack that holds kmp_dep_info objects corresponding
1872 // to each dependency. This is then passed to the OpenMP runtime.
  // For example, if there are 'n' dependencies then the following pseudo
1874 // code is generated. Assume the first dependence is on a variable 'a'
1875 //
1876 // \code{c}
1877 // DepArray = alloc(n x sizeof(kmp_depend_info);
1878 // idx = 0;
1879 // DepArray[idx].base_addr = ptrtoint(&a);
1880 // DepArray[idx].len = 8;
  //  DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1882 // ++idx;
1883 // DepArray[idx].base_addr = ...;
1884 // \endcode
1885
1886 IRBuilderBase &Builder = OMPBuilder.Builder;
1887 Type *DependInfo = OMPBuilder.DependInfo;
1888 Module &M = OMPBuilder.M;
1889
1890 Value *DepArray = nullptr;
1891 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1892 Builder.SetInsertPoint(
1894
1895 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1896 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1897
1898 Builder.restoreIP(OldIP);
1899
1900 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1901 Value *Base =
1902 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1903 // Store the pointer to the variable
1904 Value *Addr = Builder.CreateStructGEP(
1905 DependInfo, Base,
1906 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1907 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1908 Builder.CreateStore(DepValPtr, Addr);
1909 // Store the size of the variable
1910 Value *Size = Builder.CreateStructGEP(
1911 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1912 Builder.CreateStore(
1913 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1914 Size);
1915 // Store the dependency kind
1916 Value *Flags = Builder.CreateStructGEP(
1917 DependInfo, Base,
1918 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1919 Builder.CreateStore(
1920 ConstantInt::get(Builder.getInt8Ty(),
1921 static_cast<unsigned int>(Dep.DepKind)),
1922 Flags);
1923 }
1924 return DepArray;
1925}
1926
1928 const LocationDescription &Loc, InsertPointTy AllocaIP,
1929 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1930 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1931 Value *Priority) {
1932
1933 if (!updateToLocation(Loc))
1934 return InsertPointTy();
1935
1936 uint32_t SrcLocStrSize;
1937 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1938 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1939 // The current basic block is split into four basic blocks. After outlining,
1940 // they will be mapped as follows:
1941 // ```
1942 // def current_fn() {
1943 // current_basic_block:
1944 // br label %task.exit
1945 // task.exit:
1946 // ; instructions after task
1947 // }
1948 // def outlined_fn() {
1949 // task.alloca:
1950 // br label %task.body
1951 // task.body:
1952 // ret void
1953 // }
1954 // ```
1955 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1956 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1957 BasicBlock *TaskAllocaBB =
1958 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1959
1960 InsertPointTy TaskAllocaIP =
1961 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1962 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1963 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1964 return Err;
1965
1966 OutlineInfo OI;
1967 OI.EntryBB = TaskAllocaBB;
1968 OI.OuterAllocaBB = AllocaIP.getBlock();
1969 OI.ExitBB = TaskExitBB;
1970
1971 // Add the thread ID argument.
1974 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1975
1976 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1977 Mergeable, Priority, EventHandle, TaskAllocaBB,
1978 ToBeDeleted](Function &OutlinedFn) mutable {
1979 // Replace the Stale CI by appropriate RTL function call.
1980 assert(OutlinedFn.hasOneUse() &&
1981 "there must be a single user for the outlined function");
1982 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1983
1984 // HasShareds is true if any variables are captured in the outlined region,
1985 // false otherwise.
1986 bool HasShareds = StaleCI->arg_size() > 1;
1987 Builder.SetInsertPoint(StaleCI);
1988
1989 // Gather the arguments for emitting the runtime call for
1990 // @__kmpc_omp_task_alloc
1991 Function *TaskAllocFn =
1992 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1993
1994 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
1995 // call.
1996 Value *ThreadID = getOrCreateThreadID(Ident);
1997
1998 // Argument - `flags`
1999 // Task is tied iff (Flags & 1) == 1.
2000 // Task is untied iff (Flags & 1) == 0.
2001 // Task is final iff (Flags & 2) == 2.
2002 // Task is not final iff (Flags & 2) == 0.
2003 // Task is mergeable iff (Flags & 4) == 4.
2004 // Task is not mergeable iff (Flags & 4) == 0.
2005 // Task is priority iff (Flags & 32) == 32.
2006 // Task is not priority iff (Flags & 32) == 0.
2007 // TODO: Handle the other flags.
2008 Value *Flags = Builder.getInt32(Tied);
2009 if (Final) {
2010 Value *FinalFlag =
2012 Flags = Builder.CreateOr(FinalFlag, Flags);
2013 }
2014
2015 if (Mergeable)
2016 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2017 if (Priority)
2018 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2019
2020 // Argument - `sizeof_kmp_task_t` (TaskSize)
2021 // Tasksize refers to the size in bytes of kmp_task_t data structure
2022 // including private vars accessed in task.
2023 // TODO: add kmp_task_t_with_privates (privates)
2024 Value *TaskSize = Builder.getInt64(
2026
2027 // Argument - `sizeof_shareds` (SharedsSize)
2028 // SharedsSize refers to the shareds array size in the kmp_task_t data
2029 // structure.
2030 Value *SharedsSize = Builder.getInt64(0);
2031 if (HasShareds) {
2032 AllocaInst *ArgStructAlloca =
2033 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2034 assert(ArgStructAlloca &&
2035 "Unable to find the alloca instruction corresponding to arguments "
2036 "for extracted function");
2037 StructType *ArgStructType =
2038 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2039 assert(ArgStructType && "Unable to find struct type corresponding to "
2040 "arguments for extracted function");
2041 SharedsSize =
2043 }
2044 // Emit the @__kmpc_omp_task_alloc runtime call
2045 // The runtime call returns a pointer to an area where the task captured
2046 // variables must be copied before the task is run (TaskData)
2047 CallInst *TaskData = Builder.CreateCall(
2048 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2049 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2050 /*task_func=*/&OutlinedFn});
2051
2052 // Emit detach clause initialization.
2053 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2054 // task_descriptor);
2055 if (EventHandle) {
2057 OMPRTL___kmpc_task_allow_completion_event);
2058 llvm::Value *EventVal =
2059 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2060 llvm::Value *EventHandleAddr =
2062 Builder.getPtrTy(0));
2063 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2064 Builder.CreateStore(EventVal, EventHandleAddr);
2065 }
2066 // Copy the arguments for outlined function
2067 if (HasShareds) {
2068 Value *Shareds = StaleCI->getArgOperand(1);
2069 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2070 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2071 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2072 SharedsSize);
2073 }
2074
2075 if (Priority) {
2076 //
2077 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2078 // we populate the priority information into the "kmp_task_t" here
2079 //
2080 // The struct "kmp_task_t" definition is available in kmp.h
2081 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2082 // data2 is used for priority
2083 //
2084 Type *Int32Ty = Builder.getInt32Ty();
2085 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2086 // kmp_task_t* => { ptr }
2087 Type *TaskPtr = StructType::get(VoidPtr);
2088 Value *TaskGEP =
2089 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2090 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2091 Type *TaskStructType = StructType::get(
2092 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2093 Value *PriorityData = Builder.CreateInBoundsGEP(
2094 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2095 // kmp_cmplrdata_t => { ptr, ptr }
2096 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2097 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2098 PriorityData, {Zero, Zero});
2099 Builder.CreateStore(Priority, CmplrData);
2100 }
2101
2102 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2103
2104 // In the presence of the `if` clause, the following IR is generated:
2105 // ...
2106 // %data = call @__kmpc_omp_task_alloc(...)
2107 // br i1 %if_condition, label %then, label %else
2108 // then:
2109 // call @__kmpc_omp_task(...)
2110 // br label %exit
2111 // else:
2112 // ;; Wait for resolution of dependencies, if any, before
2113 // ;; beginning the task
2114 // call @__kmpc_omp_wait_deps(...)
2115 // call @__kmpc_omp_task_begin_if0(...)
2116 // call @outlined_fn(...)
2117 // call @__kmpc_omp_task_complete_if0(...)
2118 // br label %exit
2119 // exit:
2120 // ...
2121 if (IfCondition) {
2122 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2123 // terminator.
2124 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2125 Instruction *IfTerminator =
2126 Builder.GetInsertPoint()->getParent()->getTerminator();
2127 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2128 Builder.SetInsertPoint(IfTerminator);
2129 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2130 &ElseTI);
2131 Builder.SetInsertPoint(ElseTI);
2132
2133 if (Dependencies.size()) {
2134 Function *TaskWaitFn =
2135 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2137 TaskWaitFn,
2138 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2139 ConstantInt::get(Builder.getInt32Ty(), 0),
2141 }
2142 Function *TaskBeginFn =
2143 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2144 Function *TaskCompleteFn =
2145 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2146 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2147 CallInst *CI = nullptr;
2148 if (HasShareds)
2149 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2150 else
2151 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2152 CI->setDebugLoc(StaleCI->getDebugLoc());
2153 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2154 Builder.SetInsertPoint(ThenTI);
2155 }
2156
2157 if (Dependencies.size()) {
2158 Function *TaskFn =
2159 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2161 TaskFn,
2162 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2163 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2165
2166 } else {
2167 // Emit the @__kmpc_omp_task runtime call to spawn the task
2168 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2169 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2170 }
2171
2172 StaleCI->eraseFromParent();
2173
2174 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2175 if (HasShareds) {
2176 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2177 OutlinedFn.getArg(1)->replaceUsesWithIf(
2178 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2179 }
2180
2181 for (Instruction *I : llvm::reverse(ToBeDeleted))
2182 I->eraseFromParent();
2183 };
2184
2185 addOutlineInfo(std::move(OI));
2186 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2187
2188 return Builder.saveIP();
2189}
2190
2193 InsertPointTy AllocaIP,
2194 BodyGenCallbackTy BodyGenCB) {
2195 if (!updateToLocation(Loc))
2196 return InsertPointTy();
2197
2198 uint32_t SrcLocStrSize;
2199 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2200 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2201 Value *ThreadID = getOrCreateThreadID(Ident);
2202
2203 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2204 Function *TaskgroupFn =
2205 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2206 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2207
2208 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2209 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2210 return Err;
2211
2212 Builder.SetInsertPoint(TaskgroupExitBB);
2213 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2214 Function *EndTaskgroupFn =
2215 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2216 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2217
2218 return Builder.saveIP();
2219}
2220
2222 const LocationDescription &Loc, InsertPointTy AllocaIP,
2224 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2225 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2226
2227 if (!updateToLocation(Loc))
2228 return Loc.IP;
2229
2230 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2231 // this has not been created yet at some times when this callback runs.
2232 SmallVector<BranchInst *> CancellationBranches;
2233 auto FiniCBWrapper = [&](InsertPointTy IP) {
2234 if (IP.getBlock()->end() != IP.getPoint())
2235 return FiniCB(IP);
2236 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2237 // will fail because that function requires the Finalization Basic Block to
2238 // have a terminator, which is already removed by EmitOMPRegionBody.
2239 // IP is currently at cancelation block.
2240 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2241 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2242 CancellationBranches.push_back(DummyBranch);
2243 return FiniCB(IP);
2244 };
2245
2246 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2247
2248 // Each section is emitted as a switch case
2249 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2250 // -> OMP.createSection() which generates the IR for each section
2251 // Iterate through all sections and emit a switch construct:
2252 // switch (IV) {
2253 // case 0:
2254 // <SectionStmt[0]>;
2255 // break;
2256 // ...
2257 // case <NumSection> - 1:
2258 // <SectionStmt[<NumSection> - 1]>;
2259 // break;
2260 // }
2261 // ...
2262 // section_loop.after:
2263 // <FiniCB>;
2264 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2265 Builder.restoreIP(CodeGenIP);
2267 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2268 Function *CurFn = Continue->getParent();
2269 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2270
2271 unsigned CaseNumber = 0;
2272 for (auto SectionCB : SectionCBs) {
2274 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2275 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2276 Builder.SetInsertPoint(CaseBB);
2277 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2278 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2279 CaseEndBr->getIterator()}))
2280 return Err;
2281 CaseNumber++;
2282 }
2283 // remove the existing terminator from body BB since there can be no
2284 // terminators after switch/case
2285 return Error::success();
2286 };
2287 // Loop body ends here
  // LowerBound, UpperBound, and Stride for createCanonicalLoop
2289 Type *I32Ty = Type::getInt32Ty(M.getContext());
2290 Value *LB = ConstantInt::get(I32Ty, 0);
2291 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2292 Value *ST = ConstantInt::get(I32Ty, 1);
2294 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2295 if (!LoopInfo)
2296 return LoopInfo.takeError();
2297
2298 InsertPointOrErrorTy WsloopIP =
2299 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2300 WorksharingLoopType::ForStaticLoop, !IsNowait);
2301 if (!WsloopIP)
2302 return WsloopIP.takeError();
2303 InsertPointTy AfterIP = *WsloopIP;
2304
2305 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2306 assert(LoopFini && "Bad structure of static workshare loop finalization");
2307
2308 // Apply the finalization callback in LoopAfterBB
2309 auto FiniInfo = FinalizationStack.pop_back_val();
2310 assert(FiniInfo.DK == OMPD_sections &&
2311 "Unexpected finalization stack state!");
2312 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2313 Builder.restoreIP(AfterIP);
2314 BasicBlock *FiniBB =
2315 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2316 if (Error Err = CB(Builder.saveIP()))
2317 return Err;
2318 AfterIP = {FiniBB, FiniBB->begin()};
2319 }
2320
2321 // Now we can fix the dummy branch to point to the right place
2322 for (BranchInst *DummyBranch : CancellationBranches) {
2323 assert(DummyBranch->getNumSuccessors() == 1);
2324 DummyBranch->setSuccessor(0, LoopFini);
2325 }
2326
2327 return AfterIP;
2328}
2329
2332 BodyGenCallbackTy BodyGenCB,
2333 FinalizeCallbackTy FiniCB) {
2334 if (!updateToLocation(Loc))
2335 return Loc.IP;
2336
2337 auto FiniCBWrapper = [&](InsertPointTy IP) {
2338 if (IP.getBlock()->end() != IP.getPoint())
2339 return FiniCB(IP);
2340 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2341 // will fail because that function requires the Finalization Basic Block to
2342 // have a terminator, which is already removed by EmitOMPRegionBody.
2343 // IP is currently at cancelation block.
2344 // We need to backtrack to the condition block to fetch
2345 // the exit block and create a branch from cancelation
2346 // to exit block.
2348 Builder.restoreIP(IP);
2349 auto *CaseBB = Loc.IP.getBlock();
2350 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2351 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2352 Instruction *I = Builder.CreateBr(ExitBB);
2353 IP = InsertPointTy(I->getParent(), I->getIterator());
2354 return FiniCB(IP);
2355 };
2356
2357 Directive OMPD = Directive::OMPD_sections;
2358 // Since we are using Finalization Callback here, HasFinalize
2359 // and IsCancellable have to be true
2360 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2361 /*Conditional*/ false, /*hasFinalize*/ true,
2362 /*IsCancellable*/ true);
2363}
2364
2367 IT++;
2368 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2369}
2370
// Return the hardware thread id of the calling thread within its block, by
// emitting a call to the device runtime.
Value *OpenMPIRBuilder::getGPUThreadID() {
  return Builder.CreateCall(
          OMPRTL___kmpc_get_hardware_thread_id_in_block),
      {});
}
2377
2378Value *OpenMPIRBuilder::getGPUWarpSize() {
2379 return Builder.CreateCall(
2380 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2381}
2382
2383Value *OpenMPIRBuilder::getNVPTXWarpID() {
2384 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2385 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2386}
2387
2388Value *OpenMPIRBuilder::getNVPTXLaneID() {
2389 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2390 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2391 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2392 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2393 "nvptx_lane_id");
2394}
2395
// Convert \p From to \p ToType. The value is returned unchanged when the
// types already match; same-store-size values are bitcast;
// integer-to-integer conversions use a signed int cast; anything else is
// routed through a stack temporary (store the source value, reload it with
// the destination type).
Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
                                        Type *ToType) {
  Type *FromType = From->getType();
  uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
  uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
  assert(FromSize > 0 && "From size must be greater than zero");
  assert(ToSize > 0 && "To size must be greater than zero");
  // No conversion needed at all.
  if (FromType == ToType)
    return From;
  // Same storage size: reinterpret the bits in place.
  if (FromSize == ToSize)
    return Builder.CreateBitCast(From, ToType);
  // Differently-sized integers: sign-extending/truncating cast.
  if (ToType->isIntegerTy() && FromType->isIntegerTy())
    return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
  // Fallback: spill through a temporary alloca created at the dedicated
  // alloca insertion point, then reload with the destination type.
  InsertPointTy SaveIP = Builder.saveIP();
  Builder.restoreIP(AllocaIP);
  Value *CastItem = Builder.CreateAlloca(ToType);
  Builder.restoreIP(SaveIP);

      CastItem, Builder.getPtrTy(0));
  Builder.CreateStore(From, ValCastItem);
  return Builder.CreateLoad(ToType, CastItem);
}
2419
// Shuffle \p Element (of \p ElementType, at most 8 bytes) across warp lanes
// by \p Offset using the __kmpc_shuffle_int32/__kmpc_shuffle_int64 runtime
// entry points. The value is widened to a 32- or 64-bit integer before the
// call and cast back afterwards via castValueToType.
Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
                                                     Value *Element,
                                                     Type *ElementType,
                                                     Value *Offset) {
  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
  Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
  // The runtime expects the warp size as a 16-bit integer.
  Value *WarpSize =
      Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
      Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
                : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
  // NOTE(review): WarpSize above is already cast to i16; this second cast
  // looks redundant — confirm before simplifying.
  Value *WarpSizeCast =
      Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
  Value *ShuffleCall =
      Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
  // Cast the shuffled value back; CastTy here matches the widened type.
  return castValueToType(AllocaIP, ShuffleCall, CastTy);
}
2441
// Shuffle an element from a remote lane and store it to \p DstAddr. The
// element is processed in progressively smaller integer chunks (8, 4, 2, 1
// bytes); each chunk is moved across lanes with
// createRuntimeShuffleFunction. Chunks repeated more than once are copied
// in an emitted loop (pre_cond/then/exit blocks with PHIs advancing the
// source and destination pointers); a single chunk is handled inline.
void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
                                      Value *DstAddr, Type *ElemType,
                                      Value *Offset, Type *ReductionArrayTy) {
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Type *IndexTy = Builder.getIndexTy(
  Value *ElemPtr = DstAddr;
  Value *Ptr = SrcAddr;
  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
    // Skip chunk widths larger than the bytes remaining.
    if (Size < IntSize)
      continue;
    Type *IntType = Builder.getIntNTy(IntSize * 8);
        Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
    // One-past-the-end pointer of the source element, used as loop bound.
    Value *SrcAddrGEP =
        Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
        ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");

    Function *CurFunc = Builder.GetInsertBlock()->getParent();
    if ((Size / IntSize) > 1) {
      // More than one chunk of this width: emit a loop.
          SrcAddrGEP, Builder.getPtrTy());
      BasicBlock *PreCondBB =
          BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
      BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
      BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
      BasicBlock *CurrentBB = Builder.GetInsertBlock();
      emitBlock(PreCondBB, CurFunc);
      // PHIs carry the advancing source/destination pointers.
      PHINode *PhiSrc =
          Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr, CurrentBB);
      PHINode *PhiDest =
          Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr, CurrentBB);
      Ptr = PhiSrc;
      ElemPtr = PhiDest;
      // Remaining bytes = ptrEnd - ptr; continue while a full chunk fits.
      Value *PtrDiff = Builder.CreatePtrDiff(
          Builder.getInt8Ty(), PtrEnd,
          Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
          ExitBB);
      emitBlock(ThenBB, CurFunc);
      // Shuffle one chunk across lanes and store it to the destination.
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP,
              IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
          IntType, Offset);
      Builder.CreateAlignedStore(Res, ElemPtr,
                                 M.getDataLayout().getPrefTypeAlign(ElemType));
      // Advance both pointers by one chunk and loop.
      Value *LocalPtr =
          Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      Value *LocalElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
      PhiSrc->addIncoming(LocalPtr, ThenBB);
      PhiDest->addIncoming(LocalElemPtr, ThenBB);
      emitBranch(PreCondBB);
      emitBlock(ExitBB, CurFunc);
    } else {
      // Exactly one chunk of this width: shuffle and store inline.
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
      // Narrow the shuffled value back down for small integer elements.
      if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
                                         Res->getType()->getScalarSizeInBits())
        Res = Builder.CreateTrunc(Res, ElemType);
      Builder.CreateStore(Res, ElemPtr);
      Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      ElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
    }
    // Bytes still to be copied with smaller chunk widths.
    Size = Size % IntSize;
  }
}
2525
// Copy each element of the source Reduce list at \p SrcBase into the
// destination Reduce list at \p DestBase. Depending on \p Action, an element
// is either shuffled in from a remote lane (into a freshly created alloca
// whose address then replaces the destination list entry) or copied
// directly; direct copies dispatch on the element's EvaluationKind
// (scalar load/store, complex real+imag pair, or aggregate memcpy).
void OpenMPIRBuilder::emitReductionListCopy(
    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
    ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
    CopyOptionsTy CopyOptions) {
  Type *IndexTy = Builder.getIndexTy(
  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *SrcElementAddr = nullptr;
    Value *DestElementAddr = nullptr;
    Value *DestElementPtrAddr = nullptr;
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;

    // Step 1.1: Get the address for the src element in the Reduce list.
    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, SrcBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);

    // Step 1.2: Create a temporary to store the element in the destination
    // Reduce list.
    DestElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, DestBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    switch (Action) {
      // Allocate a private slot for the incoming remote value; the alloca is
      // emitted at the dedicated alloca insertion point and addrspace-cast
      // to a generic pointer.
      InsertPointTy CurIP = Builder.saveIP();
      Builder.restoreIP(AllocaIP);
      AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
                                                    ".omp.reduction.element");
      DestAlloca->setAlignment(
          M.getDataLayout().getPrefTypeAlign(RI.ElementType));
      DestElementAddr = DestAlloca;
      DestElementAddr =
          Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
                                      DestElementAddr->getName() + ".ascast");
      Builder.restoreIP(CurIP);
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
      // Destination already exists: reuse the address stored in the dest
      // Reduce list.
      DestElementAddr =
          Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
      break;
    }
    }

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
                      RemoteLaneOffset, ReductionArrayTy);
    } else {
      switch (RI.EvaluationKind) {
      case EvalKind::Scalar: {
        Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
        // Store the source element value to the dest element address.
        Builder.CreateStore(Elem, DestElementAddr);
        break;
      }
      case EvalKind::Complex: {
        // Copy real and imaginary parts individually (struct fields 0 and 1).
            RI.ElementType, SrcElementAddr, 0, 0, ".realp");
        Value *SrcReal = Builder.CreateLoad(
            RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
            RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
        Value *SrcImg = Builder.CreateLoad(
            RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

            RI.ElementType, DestElementAddr, 0, 0, ".realp");
            RI.ElementType, DestElementAddr, 0, 1, ".imagp");
        Builder.CreateStore(SrcReal, DestRealPtr);
        Builder.CreateStore(SrcImg, DestImgPtr);
        break;
      }
      case EvalKind::Aggregate: {
        // Bulk copy of the whole aggregate by its store size.
        Value *SizeVal = Builder.getInt64(
            M.getDataLayout().getTypeStoreSize(RI.ElementType));
            DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SizeVal, false);
        break;
      }
      };
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
          DestElementAddr, Builder.getPtrTy(),
          DestElementAddr->getName() + ".ascast");
      Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
    }
  }
}
2638
// Emits the runtime helper `_omp_reduction_inter_warp_copy_func(ptr ReduceList,
// i32 NumWarps)`. Lane 0 of every warp publishes its partial reduction element
// through a per-CTA transfer array in addrspace(3) (GPU shared memory); after a
// barrier, the first NumWarps threads of the block read the values back into
// their own reduce lists. Elements are moved in 4-, 2- and 1-byte chunks
// (outer TySize loop); an element wider than the current chunk is copied with
// a counted inner loop (NumIters > 1).
// NOTE(review): a few continuation lines are missing from this excerpt (gaps
// in the embedded numbering, e.g. 2644, 2648, 2692-93, 2701/03, 2742, 2751,
// 2772, 2801, 2805, 2868); the surviving code is preserved verbatim.
2639Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2640    const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2641    AttributeList FuncAttrs) {
2642  InsertPointTy SavedIP = Builder.saveIP();
2643  LLVMContext &Ctx = M.getContext();
2645      Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2646      /* IsVarArg */ false);
2647  Function *WcFunc =
2649                       "_omp_reduction_inter_warp_copy_func", &M);
2650  WcFunc->setAttributes(FuncAttrs);
2651  WcFunc->addParamAttr(0, Attribute::NoUndef);
2652  WcFunc->addParamAttr(1, Attribute::NoUndef);
2653  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2654  Builder.SetInsertPoint(EntryBB);
2655
2656  // ReduceList: thread local Reduce list.
2657  // At the stage of the computation when this function is called, partially
2658  // aggregated values reside in the first lane of every active warp.
2659  Argument *ReduceListArg = WcFunc->getArg(0);
2660  // NumWarps: number of warps active in the parallel region. This could
2661  // be smaller than 32 (max warps in a CTA) for partial block reduction.
2662  Argument *NumWarpsArg = WcFunc->getArg(1);
2663
2664  // This array is used as a medium to transfer, one reduce element at a time,
2665  // the data from the first lane of every warp to lanes in the first warp
2666  // in order to perform the final step of a reduction in a parallel region
2667  // (reduction across warps). The array is placed in NVPTX __shared__ memory
2668  // for reduced latency, as well as to have a distinct copy for concurrently
2669  // executing target regions. The array is declared with common linkage so
2670  // as to be shared across compilation units.
2671  StringRef TransferMediumName =
2672      "__openmp_nvptx_data_transfer_temporary_storage";
2673  GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2674  unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2675  ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2676  if (!TransferMedium) {
2677    TransferMedium = new GlobalVariable(
2678        M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2679        UndefValue::get(ArrayTy), TransferMediumName,
2680        /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2681        /*AddressSpace=*/3);
2682  }
2683
2684  // Get the CUDA thread id of the current OpenMP thread on the GPU.
2685  Value *GPUThreadID = getGPUThreadID();
2686  // nvptx_lane_id = nvptx_id % warpsize
2687  Value *LaneID = getNVPTXLaneID();
2688  // nvptx_warp_id = nvptx_id / warpsize
2689  Value *WarpID = getNVPTXWarpID();
2690
2691  InsertPointTy AllocaIP =
2694  Type *Arg0Type = ReduceListArg->getType();
2695  Type *Arg1Type = NumWarpsArg->getType();
2696  Builder.restoreIP(AllocaIP);
  // Spill both arguments to allocas and cast the addresses to generic
  // pointers so later loads work regardless of the alloca address space.
2697  AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2698      Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2699  AllocaInst *NumWarpsAlloca =
2700      Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2702      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2704      NumWarpsAlloca, Builder.getPtrTy(0),
2705      NumWarpsAlloca->getName() + ".ascast");
2706  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2707  Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2708  AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2709  InsertPointTy CodeGenIP =
2711  Builder.restoreIP(CodeGenIP);
2712
2713  Value *ReduceList =
2714      Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2715
2716  for (auto En : enumerate(ReductionInfos)) {
2717    //
2718    // Warp master copies reduce element to transfer medium in __shared__
2719    // memory.
2720    //
2721    const ReductionInfo &RI = En.value();
2722    unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2723    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2724      Type *CType = Builder.getIntNTy(TySize * 8);
2725
2726      unsigned NumIters = RealTySize / TySize;
2727      if (NumIters == 0)
2728        continue;
2729      Value *Cnt = nullptr;
2730      Value *CntAddr = nullptr;
2731      BasicBlock *PrecondBB = nullptr;
2732      BasicBlock *ExitBB = nullptr;
      // Multi-chunk element: materialize a loop counter in the entry allocas
      // and build a precond/body/exit loop skeleton around the copy.
2733      if (NumIters > 1) {
2734        CodeGenIP = Builder.saveIP();
2735        Builder.restoreIP(AllocaIP);
2736        CntAddr =
2737            Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2738
2739        CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2740                                              CntAddr->getName() + ".ascast");
2741        Builder.restoreIP(CodeGenIP);
2743            CntAddr,
2744            /*Volatile=*/false);
2745        PrecondBB = BasicBlock::Create(Ctx, "precond");
2746        ExitBB = BasicBlock::Create(Ctx, "exit");
2747        BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2748        emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2749        Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2750                                 /*Volatile=*/false);
2752            Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2753        Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2755      }
2756
2757      // kmpc_barrier.
2758      InsertPointOrErrorTy BarrierIP1 =
2759          createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2760                        omp::Directive::OMPD_unknown,
2761                        /* ForceSimpleCall */ false,
2762                        /* CheckCancelFlag */ true);
2763      if (!BarrierIP1)
2764        return BarrierIP1.takeError();
2765      BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2766      BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2767      BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2768
2769      // if (lane_id == 0)
2770      Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2771      Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2773
2774      // Reduce element = LocalReduceList[i]
2775      auto *RedListArrayTy =
2776          ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2777      Type *IndexTy = Builder.getIndexTy(
2779      Value *ElemPtrPtr =
2780          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2781                                    {ConstantInt::get(IndexTy, 0),
2782                                     ConstantInt::get(IndexTy, En.index())});
2783      // elemptr = ((CopyType*)(elemptrptr)) + I
2784      Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2785      if (NumIters > 1)
2786        ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2787
2788      // Get pointer to location in transfer medium.
2789      // MediumPtr = &medium[warp_id]
2790      Value *MediumPtr = Builder.CreateInBoundsGEP(
2791          ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2792      // elem = *elemptr
2793      //*MediumPtr = elem
2794      Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2795      // Store the source element value to the dest element address.
      // Volatile store: the shared-memory slot is read by other threads after
      // the following barrier, so the store must not be elided or reordered.
2796      Builder.CreateStore(Elem, MediumPtr,
2797                          /*IsVolatile*/ true);
2798      Builder.CreateBr(MergeBB);
2799
2800      // else
2802      Builder.CreateBr(MergeBB);
2803
2804      // endif
2806      InsertPointOrErrorTy BarrierIP2 =
2807          createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2808                        omp::Directive::OMPD_unknown,
2809                        /* ForceSimpleCall */ false,
2810                        /* CheckCancelFlag */ true);
2811      if (!BarrierIP2)
2812        return BarrierIP2.takeError();
2813
2814      // Warp 0 copies reduce element from transfer medium
2815      BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2816      BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2817      BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2818
2819      Value *NumWarpsVal =
2820          Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2821      // Up to 32 threads in warp 0 are active.
2822      Value *IsActiveThread =
2823          Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2824      Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2825
2826      emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2827
2828      // SecMediumPtr = &medium[tid]
2829      // SrcMediumVal = *SrcMediumPtr
2830      Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2831          ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2832      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2833      Value *TargetElemPtrPtr =
2834          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2835                                    {ConstantInt::get(IndexTy, 0),
2836                                     ConstantInt::get(IndexTy, En.index())});
2837      Value *TargetElemPtrVal =
2838          Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2839      Value *TargetElemPtr = TargetElemPtrVal;
2840      if (NumIters > 1)
2841        TargetElemPtr =
2842            Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2843
2844      // *TargetElemPtr = SrcMediumVal;
2845      Value *SrcMediumValue =
2846          Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2847      Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2848      Builder.CreateBr(W0MergeBB);
2849
2850      emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2851      Builder.CreateBr(W0MergeBB);
2852
2853      emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2854
      // Close the multi-chunk loop: bump the counter and branch back to the
      // precondition check.
2855      if (NumIters > 1) {
2856        Cnt = Builder.CreateNSWAdd(
2857            Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2858        Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2859
2860        auto *CurFn = Builder.GetInsertBlock()->getParent();
2861        emitBranch(PrecondBB);
2862        emitBlock(ExitBB, CurFn);
2863      }
      // Any leftover bytes are handled by the next smaller chunk size.
2864      RealTySize %= TySize;
2865    }
2866  }
2867
2869  Builder.restoreIP(SavedIP);
2870
2871  return WcFunc;
2872}
2873
// Emits `_omp_reduction_shuffle_and_reduce_func(ptr ReduceList, i16 LaneId,
// i16 RemoteLaneOffset, i16 AlgoVer)`. It copies the reduce list of a remote
// lane into a stack-local copy (emitReductionListCopy with
// CopyAction::RemoteLaneToThread), conditionally calls \p ReduceFn to fold the
// remote values into the local list, and for AlgoVer==1 may instead copy the
// remote list over the local one. AlgoVer is expected to be a compile-time
// constant so the predicate below folds (see the large comment in the body).
// NOTE(review): a few continuation lines are missing from this excerpt (gaps
// in the embedded numbering); the surviving code is preserved verbatim.
2874Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2875    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2876    AttributeList FuncAttrs) {
2877  LLVMContext &Ctx = M.getContext();
2878  FunctionType *FuncTy =
2880                        {Builder.getPtrTy(), Builder.getInt16Ty(),
2881                         Builder.getInt16Ty(), Builder.getInt16Ty()},
2882                        /* IsVarArg */ false);
2883  Function *SarFunc =
2885                       "_omp_reduction_shuffle_and_reduce_func", &M);
2886  SarFunc->setAttributes(FuncAttrs);
2887  SarFunc->addParamAttr(0, Attribute::NoUndef);
2888  SarFunc->addParamAttr(1, Attribute::NoUndef);
2889  SarFunc->addParamAttr(2, Attribute::NoUndef);
2890  SarFunc->addParamAttr(3, Attribute::NoUndef);
  // The three i16 arguments are sign-extended at the ABI boundary.
2891  SarFunc->addParamAttr(1, Attribute::SExt);
2892  SarFunc->addParamAttr(2, Attribute::SExt);
2893  SarFunc->addParamAttr(3, Attribute::SExt);
2894  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2895  Builder.SetInsertPoint(EntryBB);
2896
2897  // Thread local Reduce list used to host the values of data to be reduced.
2898  Argument *ReduceListArg = SarFunc->getArg(0);
2899  // Current lane id; could be logical.
2900  Argument *LaneIDArg = SarFunc->getArg(1);
2901  // Offset of the remote source lane relative to the current lane.
2902  Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2903  // Algorithm version. This is expected to be known at compile time.
2904  Argument *AlgoVerArg = SarFunc->getArg(3);
2905
2906  Type *ReduceListArgType = ReduceListArg->getType();
2907  Type *LaneIDArgType = LaneIDArg->getType();
2908  Type *LaneIDArgPtrType = Builder.getPtrTy(0);
  // Spill all four arguments to allocas and address-space-cast the slots to
  // generic pointers before reloading them.
2909  Value *ReduceListAlloca = Builder.CreateAlloca(
2910      ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2911  Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2912                                             LaneIDArg->getName() + ".addr");
2913  Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2914      LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2915  Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2916                                              AlgoVerArg->getName() + ".addr");
2917  ArrayType *RedListArrayTy =
2918      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2919
2920  // Create a local thread-private variable to host the Reduce list
2921  // from a remote lane.
2922  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2923      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2924
2926      ReduceListAlloca, ReduceListArgType,
2927      ReduceListAlloca->getName() + ".ascast");
2929      LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2930  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2931      RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2932      RemoteLaneOffsetAlloca->getName() + ".ascast");
2934      AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2936      RemoteReductionListAlloca, Builder.getPtrTy(),
2937      RemoteReductionListAlloca->getName() + ".ascast");
2938
2939  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2940  Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2941  Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2942  Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2943
2944  Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2945  Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2946  Value *RemoteLaneOffset =
2947      Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2948  Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2949
2950  InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2951
2952  // This loop iterates through the list of reduce elements and copies,
2953  // element by element, from a remote lane in the warp to RemoteReduceList,
2954  // hosted on the thread's stack.
2955  emitReductionListCopy(
2956      AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2957      ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2958
2959  // The actions to be performed on the Remote Reduce list is dependent
2960  // on the algorithm version.
2961  //
2962  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2963  // LaneId % 2 == 0 && Offset > 0):
2964  //   do the reduction value aggregation
2965  //
2966  // The thread local variable Reduce list is mutated in place to host the
2967  // reduced data, which is the aggregated value produced from local and
2968  // remote lanes.
2969  //
2970  // Note that AlgoVer is expected to be a constant integer known at compile
2971  // time.
2972  // When AlgoVer==0, the first conjunction evaluates to true, making
2973  // the entire predicate true during compile time.
2974  // When AlgoVer==1, the second conjunction has only the second part to be
2975  // evaluated during runtime. Other conjunctions evaluates to false
2976  // during compile time.
2977  // When AlgoVer==2, the third conjunction has only the second part to be
2978  // evaluated during runtime. Other conjunctions evaluates to false
2979  // during compile time.
2980  Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2981  Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2982  Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2983  Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2984  Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2985  Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2986  Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2987  Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2988  Value *RemoteOffsetComp =
2989      Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2990  Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2991  Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2992  Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2993
2994  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2995  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2996  BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2997
  // then: fold the remote list into the local one via the reduce function.
2998  Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3001      ReduceList, Builder.getPtrTy());
3002  Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3003      RemoteListAddrCast, Builder.getPtrTy());
3004  Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3005      ->addFnAttr(Attribute::NoUnwind);
3006  Builder.CreateBr(MergeBB);
3007
3009  Builder.CreateBr(MergeBB);
3010
3012
3013  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3014  // Reduce list.
3015  Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3016  Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3017  Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3018
3019  BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3020  BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3021  BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3022  Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3023
3024  emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3025  emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3026                        ReductionInfos, RemoteListAddrCast, ReduceList);
3027  Builder.CreateBr(CpyMergeBB);
3028
3029  emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3030  Builder.CreateBr(CpyMergeBB);
3031
3032  emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3033
3035
3036  return SarFunc;
3037}
3038
// Emits `_omp_reduction_list_to_global_copy_func(ptr Buffer, i32 Idx,
// ptr ReduceList)`: for each reduction element it copies the thread-local
// value at ReduceList[i] into slot i of Buffer[Idx] (type
// \p ReductionsBufferTy). Scalars are copied with a load/store, complex
// (two-field struct) values field by field, and aggregates with a memcpy
// sized by the element's store size.
// NOTE(review): a few continuation lines are missing from this excerpt (gaps
// in the embedded numbering); the surviving code is preserved verbatim.
3039Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3040    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3041    AttributeList FuncAttrs) {
3043  LLVMContext &Ctx = M.getContext();
3046      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3047      /* IsVarArg */ false);
3048  Function *LtGCFunc =
3050                       "_omp_reduction_list_to_global_copy_func", &M);
3051  LtGCFunc->setAttributes(FuncAttrs);
3052  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3053  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3054  LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3055
3056  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3057  Builder.SetInsertPoint(EntryBlock);
3058
3059  // Buffer: global reduction buffer.
3060  Argument *BufferArg = LtGCFunc->getArg(0);
3061  // Idx: index of the buffer.
3062  Argument *IdxArg = LtGCFunc->getArg(1);
3063  // ReduceList: thread local Reduce list.
3064  Argument *ReduceListArg = LtGCFunc->getArg(2);
3065
  // Spill arguments to allocas and cast the slots to generic pointers.
3066  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3067                                                BufferArg->getName() + ".addr");
3068  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3069                                             IdxArg->getName() + ".addr");
3070  Value *ReduceListArgAlloca = Builder.CreateAlloca(
3071      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3073      BufferArgAlloca, Builder.getPtrTy(),
3074      BufferArgAlloca->getName() + ".ascast");
3076      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3077  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3078      ReduceListArgAlloca, Builder.getPtrTy(),
3079      ReduceListArgAlloca->getName() + ".ascast");
3080
3081  Builder.CreateStore(BufferArg, BufferArgAddrCast);
3082  Builder.CreateStore(IdxArg, IdxArgAddrCast);
3083  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3084
3085  Value *LocalReduceList =
3086      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3087  Value *BufferArgVal =
3088      Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3089  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3090  Type *IndexTy = Builder.getIndexTy(
3092  for (auto En : enumerate(ReductionInfos)) {
3093    const ReductionInfo &RI = En.value();
3094    auto *RedListArrayTy =
3095        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3096    // Reduce element = LocalReduceList[i]
3097    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3098        RedListArrayTy, LocalReduceList,
3099        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3100    // elemptr = ((CopyType*)(elemptrptr)) + I
3101    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3102
3103    // Global = Buffer.VD[Idx];
3104    Value *BufferVD =
3105        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3107        ReductionsBufferTy, BufferVD, 0, En.index());
3108
    // Copy strategy depends on how the element is evaluated.
3109    switch (RI.EvaluationKind) {
3110    case EvalKind::Scalar: {
3111      Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3112      Builder.CreateStore(TargetElement, GlobVal);
3113      break;
3114    }
3115    case EvalKind::Complex: {
      // Copy {real, imag} fields individually.
3117          RI.ElementType, ElemPtr, 0, 0, ".realp");
3118      Value *SrcReal = Builder.CreateLoad(
3119          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3121          RI.ElementType, ElemPtr, 0, 1, ".imagp");
3122      Value *SrcImg = Builder.CreateLoad(
3123          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3124
3126          RI.ElementType, GlobVal, 0, 0, ".realp");
3128          RI.ElementType, GlobVal, 0, 1, ".imagp");
3129      Builder.CreateStore(SrcReal, DestRealPtr);
3130      Builder.CreateStore(SrcImg, DestImgPtr);
3131      break;
3132    }
3133    case EvalKind::Aggregate: {
3134      Value *SizeVal =
3135          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3137          GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3138          M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3139      break;
3140    }
3141    }
3142  }
3143
3145  Builder.restoreIP(OldIP);
3146  return LtGCFunc;
3147}
3148
// Emits `_omp_reduction_list_to_global_reduce_func(ptr Buffer, i32 Idx,
// ptr ReduceList)`: it builds a stack-local reduce list whose entries point
// directly into Buffer[Idx], then calls
// `reduce_function(GlobalReduceList, ReduceList)` so the thread-local values
// are folded into the global buffer.
// NOTE(review): a few continuation lines are missing from this excerpt (gaps
// in the embedded numbering); the surviving code is preserved verbatim.
3149Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3150    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3151    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3153  LLVMContext &Ctx = M.getContext();
3156      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3157      /* IsVarArg */ false);
3158  Function *LtGRFunc =
3160                       "_omp_reduction_list_to_global_reduce_func", &M);
3161  LtGRFunc->setAttributes(FuncAttrs);
3162  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3163  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3164  LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3165
3166  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3167  Builder.SetInsertPoint(EntryBlock);
3168
3169  // Buffer: global reduction buffer.
3170  Argument *BufferArg = LtGRFunc->getArg(0);
3171  // Idx: index of the buffer.
3172  Argument *IdxArg = LtGRFunc->getArg(1);
3173  // ReduceList: thread local Reduce list.
3174  Argument *ReduceListArg = LtGRFunc->getArg(2);
3175
  // Spill arguments to allocas and cast the slots to generic pointers.
3176  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3177                                                BufferArg->getName() + ".addr");
3178  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3179                                             IdxArg->getName() + ".addr");
3180  Value *ReduceListArgAlloca = Builder.CreateAlloca(
3181      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3182  auto *RedListArrayTy =
3183      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3184
3185  // 1. Build a list of reduction variables.
3186  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3187  Value *LocalReduceList =
3188      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3189
3191      BufferArgAlloca, Builder.getPtrTy(),
3192      BufferArgAlloca->getName() + ".ascast");
3194      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3195  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3196      ReduceListArgAlloca, Builder.getPtrTy(),
3197      ReduceListArgAlloca->getName() + ".ascast");
3198  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3199      LocalReduceList, Builder.getPtrTy(),
3200      LocalReduceList->getName() + ".ascast");
3201
3202  Builder.CreateStore(BufferArg, BufferArgAddrCast);
3203  Builder.CreateStore(IdxArg, IdxArgAddrCast);
3204  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3205
3206  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3207  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3208  Type *IndexTy = Builder.getIndexTy(
  // Point each entry of the local list at the matching field of Buffer[Idx].
3210  for (auto En : enumerate(ReductionInfos)) {
3211    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3212        RedListArrayTy, LocalReduceListAddrCast,
3213        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3214    Value *BufferVD =
3215        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3216    // Global = Buffer.VD[Idx];
3218        ReductionsBufferTy, BufferVD, 0, En.index());
3219    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3220  }
3221
3222  // Call reduce_function(GlobalReduceList, ReduceList)
3223  Value *ReduceList =
3224      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3225  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3226      ->addFnAttr(Attribute::NoUnwind);
3228  Builder.restoreIP(OldIP);
3229  return LtGRFunc;
3230}
3231
// Emits `_omp_reduction_global_to_list_copy_func(ptr Buffer, i32 Idx,
// ptr ReduceList)` — the inverse of the list-to-global copy helper above:
// each element of Buffer[Idx] is copied into the thread-local ReduceList[i].
// Scalars use a load/store, complex (two-field struct) values are copied
// field by field, aggregates via memcpy.
// NOTE(review): a few continuation lines are missing from this excerpt (gaps
// in the embedded numbering); the surviving code is preserved verbatim.
3232Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3233    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3234    AttributeList FuncAttrs) {
3236  LLVMContext &Ctx = M.getContext();
3239      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3240      /* IsVarArg */ false);
3241  Function *LtGCFunc =
3243                       "_omp_reduction_global_to_list_copy_func", &M);
3244  LtGCFunc->setAttributes(FuncAttrs);
3245  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3246  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3247  LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3248
3249  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3250  Builder.SetInsertPoint(EntryBlock);
3251
3252  // Buffer: global reduction buffer.
3253  Argument *BufferArg = LtGCFunc->getArg(0);
3254  // Idx: index of the buffer.
3255  Argument *IdxArg = LtGCFunc->getArg(1);
3256  // ReduceList: thread local Reduce list.
3257  Argument *ReduceListArg = LtGCFunc->getArg(2);
3258
  // Spill arguments to allocas and cast the slots to generic pointers.
3259  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3260                                                BufferArg->getName() + ".addr");
3261  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3262                                             IdxArg->getName() + ".addr");
3263  Value *ReduceListArgAlloca = Builder.CreateAlloca(
3264      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3266      BufferArgAlloca, Builder.getPtrTy(),
3267      BufferArgAlloca->getName() + ".ascast");
3269      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3270  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3271      ReduceListArgAlloca, Builder.getPtrTy(),
3272      ReduceListArgAlloca->getName() + ".ascast");
3273  Builder.CreateStore(BufferArg, BufferArgAddrCast);
3274  Builder.CreateStore(IdxArg, IdxArgAddrCast);
3275  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3276
3277  Value *LocalReduceList =
3278      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3279  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3280  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3281  Type *IndexTy = Builder.getIndexTy(
3283  for (auto En : enumerate(ReductionInfos)) {
3284    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3285    auto *RedListArrayTy =
3286        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3287    // Reduce element = LocalReduceList[i]
3288    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3289        RedListArrayTy, LocalReduceList,
3290        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3291    // elemptr = ((CopyType*)(elemptrptr)) + I
3292    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3293    // Global = Buffer.VD[Idx];
3294    Value *BufferVD =
3295        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3297        ReductionsBufferTy, BufferVD, 0, En.index());
3298
    // Copy strategy depends on how the element is evaluated; note the copy
    // direction here is global -> local (mirror image of the list-to-global
    // copy helper).
3299    switch (RI.EvaluationKind) {
3300    case EvalKind::Scalar: {
3301      Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3302      Builder.CreateStore(TargetElement, ElemPtr);
3303      break;
3304    }
3305    case EvalKind::Complex: {
3307          RI.ElementType, GlobValPtr, 0, 0, ".realp");
3308      Value *SrcReal = Builder.CreateLoad(
3309          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3311          RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3312      Value *SrcImg = Builder.CreateLoad(
3313          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3314
3316          RI.ElementType, ElemPtr, 0, 0, ".realp");
3318          RI.ElementType, ElemPtr, 0, 1, ".imagp");
3319      Builder.CreateStore(SrcReal, DestRealPtr);
3320      Builder.CreateStore(SrcImg, DestImgPtr);
3321      break;
3322    }
3323    case EvalKind::Aggregate: {
3324      Value *SizeVal =
3328          GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3329          SizeVal, false);
3330      break;
3331    }
3332    }
3333  }
3334
3336  Builder.restoreIP(OldIP);
3337  return LtGCFunc;
3338}
3339
// Emits `_omp_reduction_global_to_list_reduce_func(ptr Buffer, i32 Idx,
// ptr ReduceList)` — the mirror of emitListToGlobalReduceFunction: it builds
// a stack-local reduce list whose entries point into Buffer[Idx] and calls
// `reduce_function(ReduceList, GlobalReduceList)`, i.e. the global values are
// folded into the thread-local list (note the swapped argument order vs. the
// list-to-global variant).
// NOTE(review): a few continuation lines are missing from this excerpt (gaps
// in the embedded numbering); the surviving code is preserved verbatim.
3340Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3341    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3342    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3344  LLVMContext &Ctx = M.getContext();
3345  auto *FuncTy = FunctionType::get(
3347      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3348      /* IsVarArg */ false);
3349  Function *LtGRFunc =
3351                       "_omp_reduction_global_to_list_reduce_func", &M);
3352  LtGRFunc->setAttributes(FuncAttrs);
3353  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3354  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3355  LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3356
3357  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3358  Builder.SetInsertPoint(EntryBlock);
3359
3360  // Buffer: global reduction buffer.
3361  Argument *BufferArg = LtGRFunc->getArg(0);
3362  // Idx: index of the buffer.
3363  Argument *IdxArg = LtGRFunc->getArg(1);
3364  // ReduceList: thread local Reduce list.
3365  Argument *ReduceListArg = LtGRFunc->getArg(2);
3366
  // Spill arguments to allocas and cast the slots to generic pointers.
3367  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3368                                                BufferArg->getName() + ".addr");
3369  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3370                                             IdxArg->getName() + ".addr");
3371  Value *ReduceListArgAlloca = Builder.CreateAlloca(
3372      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3373  ArrayType *RedListArrayTy =
3374      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3375
3376  // 1. Build a list of reduction variables.
3377  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3378  Value *LocalReduceList =
3379      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3380
3382      BufferArgAlloca, Builder.getPtrTy(),
3383      BufferArgAlloca->getName() + ".ascast");
3385      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3386  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3387      ReduceListArgAlloca, Builder.getPtrTy(),
3388      ReduceListArgAlloca->getName() + ".ascast");
3390      LocalReduceList, Builder.getPtrTy(),
3391      LocalReduceList->getName() + ".ascast");
3392
3393  Builder.CreateStore(BufferArg, BufferArgAddrCast);
3394  Builder.CreateStore(IdxArg, IdxArgAddrCast);
3395  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3396
3397  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3398  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3399  Type *IndexTy = Builder.getIndexTy(
  // Point each entry of the local list at the matching field of Buffer[Idx].
3401  for (auto En : enumerate(ReductionInfos)) {
3402    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3403        RedListArrayTy, ReductionList,
3404        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3405    // Global = Buffer.VD[Idx];
3406    Value *BufferVD =
3407        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3409        ReductionsBufferTy, BufferVD, 0, En.index());
3410    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3411  }
3412
3413  // Call reduce_function(ReduceList, GlobalReduceList)
3414  Value *ReduceList =
3415      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3416  Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3417      ->addFnAttr(Attribute::NoUnwind);
3419  Builder.restoreIP(OldIP);
3420  return LtGRFunc;
3421}
3422
3423std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3424 std::string Suffix =
3425 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3426 return (Name + Suffix).str();
3427}
3428
// Creates the `void <name>.omp.reduction.reduction_func(ptr LHS, ptr RHS)`
// helper that folds the RHS reduce-list elements into the LHS ones. Per
// element it either (a) records the LHS/RHS pointers and later splices in the
// Clang-generated reduction body via RI.ReductionGenClang (the fixup loop at
// the bottom), or (b) loads both values and emits the reduction inline via
// RI.ReductionGen. The branch selecting between (a) and (b) is on a line
// missing from this excerpt — presumably it tests ReductionGenCBKind; confirm
// against upstream.
// NOTE(review): a few continuation lines are missing from this excerpt (gaps
// in the embedded numbering); the surviving code is preserved verbatim.
3429Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3430    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3431    ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3432  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3433                                   {Builder.getPtrTy(), Builder.getPtrTy()},
3434                                   /* IsVarArg */ false);
3435  std::string Name = getReductionFuncName(ReducerName);
3436  Function *ReductionFunc =
3438  ReductionFunc->setAttributes(FuncAttrs);
3439  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3440  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3441  BasicBlock *EntryBB =
3442      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3443  Builder.SetInsertPoint(EntryBB);
3444
3445  // Need to alloca memory here and deal with the pointers before getting
3446  // LHS/RHS pointers out
3447  Value *LHSArrayPtr = nullptr;
3448  Value *RHSArrayPtr = nullptr;
3449  Argument *Arg0 = ReductionFunc->getArg(0);
3450  Argument *Arg1 = ReductionFunc->getArg(1);
3451  Type *Arg0Type = Arg0->getType();
3452  Type *Arg1Type = Arg1->getType();
3453
  // Spill both list pointers to allocas, cast to generic address space, and
  // reload — the loaded values are the array bases used below.
3454  Value *LHSAlloca =
3455      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3456  Value *RHSAlloca =
3457      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3459      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3461      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3462  Builder.CreateStore(Arg0, LHSAddrCast);
3463  Builder.CreateStore(Arg1, RHSAddrCast);
3464  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3465  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3466
3467  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3468  Type *IndexTy = Builder.getIndexTy(
3470  SmallVector<Value *> LHSPtrs, RHSPtrs;
3471  for (auto En : enumerate(ReductionInfos)) {
3472    const ReductionInfo &RI = En.value();
    // RHS element i: load the stored pointer and cast it to the private
    // variable's pointer type.
3473    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3474        RedArrayTy, RHSArrayPtr,
3475        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3476    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3478        RHSI8Ptr, RI.PrivateVariable->getType(),
3479        RHSI8Ptr->getName() + ".ascast");
3480
3481    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3482        RedArrayTy, LHSArrayPtr,
3483        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3484    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3486        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3487
3489      LHSPtrs.emplace_back(LHSPtr);
3490      RHSPtrs.emplace_back(RHSPtr);
3491    } else {
      // Inline path: LHS[i] = ReductionGen(LHS[i], RHS[i]).
3492      Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3493      Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3494      Value *Reduced;
3495      InsertPointOrErrorTy AfterIP =
3496          RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3497      if (!AfterIP)
3498        return AfterIP.takeError();
      // A null insert block means the callback terminated codegen; the
      // function is returned as-is without storing a result.
3499      if (!Builder.GetInsertBlock())
3500        return ReductionFunc;
3501      Builder.CreateStore(Reduced, LHSPtr);
3502    }
3503  }
3504
3506  for (auto En : enumerate(ReductionInfos)) {
3507    unsigned Index = En.index();
3508    const ReductionInfo &RI = En.value();
3509    Value *LHSFixupPtr, *RHSFixupPtr;
3510    Builder.restoreIP(RI.ReductionGenClang(
3511        Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3512
3513    // Fix the CallBack code generated to use the correct Values for the LHS
3514    // and RHS
    // Only rewrite uses inside ReductionFunc; the callback may also have
    // created uses in other functions, which must keep the original pointers.
3515    LHSFixupPtr->replaceUsesWithIf(
3516        LHSPtrs[Index], [ReductionFunc](const Use &U) {
3517          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3518                 ReductionFunc;
3519        });
3520    RHSFixupPtr->replaceUsesWithIf(
3521        RHSPtrs[Index], [ReductionFunc](const Use &U) {
3522          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3523                 ReductionFunc;
3524        });
3525  }
3526
3528  return ReductionFunc;
3529}
3530
// Debug-build sanity checks for a list of ReductionInfo entries: every entry
// must carry a variable, a private copy, and at least one reduction-generator
// callback. On the host (!IsGPU) the variable and its private copy must also
// have identical types; on the GPU they may differ (the private copy can live
// in a different address space). All variables must be pointers.
3531static void
3533 bool IsGPU) {
3534 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
// RI is only used inside asserts; silence -Wunused in release builds.
3535 (void)RI;
3536 assert(RI.Variable && "expected non-null variable");
3537 assert(RI.PrivateVariable && "expected non-null private variable");
3538 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3539 "expected non-null reduction generator callback");
3540 if (!IsGPU) {
3541 assert(
3542 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3543 "expected variables and their private equivalents to have the same "
3544 "type");
3545 }
3546 assert(RI.Variable->getType()->isPointerTy() &&
3547 "expected variables to be pointers");
3548 }
3549}
3550
// GPU lowering of OpenMP reductions. Builds the type-erased list of private
// reduction values, emits the device helper functions (shuffle-and-reduce,
// inter-warp copy, and for teams reductions the list<->global copy/reduce
// quartet), then calls the appropriate device runtime entry point
// (__kmpc_nvptx_parallel_reduce_nowait_v2 or
// __kmpc_nvptx_teams_reduce_nowait_v2) and finalizes the reduced values in
// the "then" branch taken by the winning thread/team.
3552 const LocationDescription &Loc, InsertPointTy AllocaIP,
3553 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3554 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3555 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3556 Value *SrcLocInfo) {
3557 if (!updateToLocation(Loc))
3558 return InsertPointTy();
3559 Builder.restoreIP(CodeGenIP);
3560 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3561 LLVMContext &Ctx = M.getContext();
3562
3563 // Source location for the ident struct
3564 if (!SrcLocInfo) {
3565 uint32_t SrcLocStrSize;
3566 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3567 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3568 }
3569
// Nothing to reduce: keep the current insertion point unchanged.
3570 if (ReductionInfos.size() == 0)
3571 return Builder.saveIP();
3572
3573 BasicBlock *ContinuationBlock = nullptr;
3575 // Copied code from createReductions
3576 BasicBlock *InsertBlock = Loc.IP.getBlock();
3577 ContinuationBlock =
3578 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3579 InsertBlock->getTerminator()->eraseFromParent();
3580 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3581 }
3582
// Propagate the enclosing function's attributes to the helper functions,
// but strip OptimizeNone so the helpers can be optimized.
3583 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3584 AttributeList FuncAttrs;
3585 AttrBuilder AttrBldr(Ctx);
3586 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3587 AttrBldr.addAttribute(Attr);
3588 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3589 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3590
3591 CodeGenIP = Builder.saveIP();
3592 Expected<Function *> ReductionResult =
3593 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3594 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3595 if (!ReductionResult)
3596 return ReductionResult.takeError();
3597 Function *ReductionFunc = *ReductionResult;
3598 Builder.restoreIP(CodeGenIP);
3599
3600 // Set the grid value in the config needed for lowering later on
3601 if (GridValue.has_value())
3602 Config.setGridValue(GridValue.value());
3603 else
3604 Config.setGridValue(getGridValue(T, ReductionFunc));
3605
3606 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3607 // RedList, shuffle_reduce_func, interwarp_copy_func);
3608 // or
3609 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3610 Value *Res;
3611
3612 // 1. Build a list of reduction variables.
3613 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3614 auto Size = ReductionInfos.size();
3615 Type *PtrTy = PointerType::getUnqual(Ctx);
3616 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
// The list itself must be allocated at AllocaIP; switch there and back.
3617 CodeGenIP = Builder.saveIP();
3618 Builder.restoreIP(AllocaIP);
3619 Value *ReductionListAlloca =
3620 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3622 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3623 Builder.restoreIP(CodeGenIP);
3624 Type *IndexTy = Builder.getIndexTy(
3626 for (auto En : enumerate(ReductionInfos)) {
3627 const ReductionInfo &RI = En.value();
3628 Value *ElemPtr = Builder.CreateInBoundsGEP(
3629 RedArrayTy, ReductionList,
3630 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3631 Value *CastElem =
3633 Builder.CreateStore(CastElem, ElemPtr);
3634 }
// Emit the device helper functions used by the runtime call below.
3635 CodeGenIP = Builder.saveIP();
3636 Function *SarFunc =
3637 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3638 Expected<Function *> CopyResult =
3639 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3640 if (!CopyResult)
3641 return CopyResult.takeError();
3642 Function *WcFunc = *CopyResult;
3643 Builder.restoreIP(CodeGenIP);
3644
3645 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3646
// The runtime is told the total payload size as
// max(element store size) * number of reductions.
3647 unsigned MaxDataSize = 0;
3648 SmallVector<Type *> ReductionTypeArgs;
3649 for (auto En : enumerate(ReductionInfos)) {
3650 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3651 if (Size > MaxDataSize)
3652 MaxDataSize = Size;
3653 ReductionTypeArgs.emplace_back(En.value().ElementType);
3654 }
3655 Value *ReductionDataSize =
3656 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3657 if (!IsTeamsReduction) {
3658 Value *SarFuncCast =
3660 Value *WcFuncCast =
3662 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3663 WcFuncCast};
3665 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3666 Res = Builder.CreateCall(Pv2Ptr, Args);
3667 } else {
// Teams reduction additionally needs the fixed global buffer and the
// four list<->global copy/reduce helpers.
3668 CodeGenIP = Builder.saveIP();
3669 StructType *ReductionsBufferTy = StructType::create(
3670 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3671 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3672 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3673 Function *LtGCFunc = emitListToGlobalCopyFunction(
3674 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3675 Function *LtGRFunc = emitListToGlobalReduceFunction(
3676 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3677 Function *GtLCFunc = emitGlobalToListCopyFunction(
3678 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3679 Function *GtLRFunc = emitGlobalToListReduceFunction(
3680 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3681 Builder.restoreIP(CodeGenIP);
3682
3683 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3684 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3685
3686 Value *Args3[] = {SrcLocInfo,
3687 KernelTeamsReductionPtr,
3688 Builder.getInt32(ReductionBufNum),
3689 ReductionDataSize,
3690 RL,
3691 SarFunc,
3692 WcFunc,
3693 LtGCFunc,
3694 LtGRFunc,
3695 GtLCFunc,
3696 GtLRFunc};
3697
3698 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3699 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3700 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3701 }
3702
3703 // 5. Build if (res == 1)
3704 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3705 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3707 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3708
3709 // 6. Build then branch: where we have reduced values in the master
3710 // thread in each team.
3711 // __kmpc_end_reduce{_nowait}(<gtid>);
3712 // break;
3713 emitBlock(ThenBB, CurFunc);
3714
3715 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3716 for (auto En : enumerate(ReductionInfos)) {
3717 const ReductionInfo &RI = En.value();
3718 Value *LHS = RI.Variable;
3719 Value *RHS =
3721
3723 Value *LHSPtr, *RHSPtr;
3725 &LHSPtr, &RHSPtr, CurFunc));
3726
3727 // Fix the CallBack code generated to use the correct Values for the LHS
3728 // and RHS
3729 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3730 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3731 ReductionFunc;
3732 });
3733 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3734 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3735 ReductionFunc;
3736 });
3737 } else {
3738 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3739 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3740 Value *Reduced;
3741 InsertPointOrErrorTy AfterIP =
3742 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3743 if (!AfterIP)
3744 return AfterIP.takeError();
3745 Builder.CreateStore(Reduced, LHS, false);
3746 }
3747 }
3748 emitBlock(ExitBB, CurFunc);
3749 if (ContinuationBlock) {
3750 Builder.CreateBr(ContinuationBlock);
3751 Builder.SetInsertPoint(ContinuationBlock);
3752 }
3754
3755 return Builder.saveIP();
3756}
3757
// Declare (without a body) a fresh host-side outlined reduction function of
// type void(ptr, ptr); the body is filled in later by
// populateReductionFunction.
3759 Type *VoidTy = Type::getVoidTy(M.getContext());
3760 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3761 auto *FuncTy =
3762 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3764 ".omp.reduction.func", &M);
3765}
3766
// Fill in the body of an outlined reduction function previously declared by
// getFreshReductionFunc. The two arguments are type-erased arrays of
// pointers to the LHS (destination) and RHS (private) values; each element
// pair is loaded, reduced via RI.ReductionGen, and (unless the reduction is
// by-ref, where the store happens inside the reduction region) stored back
// to the LHS. On GPU targets the array pointers are first spilled/reloaded
// through address-space-cast allocas.
3768 Function *ReductionFunc,
3770 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3771 Module *Module = ReductionFunc->getParent();
3772 BasicBlock *ReductionFuncBlock =
3773 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3774 Builder.SetInsertPoint(ReductionFuncBlock);
3775 Value *LHSArrayPtr = nullptr;
3776 Value *RHSArrayPtr = nullptr;
3777 if (IsGPU) {
3778 // Need to alloca memory here and deal with the pointers before getting
3779 // LHS/RHS pointers out
3780 //
3781 Argument *Arg0 = ReductionFunc->getArg(0);
3782 Argument *Arg1 = ReductionFunc->getArg(1);
3783 Type *Arg0Type = Arg0->getType();
3784 Type *Arg1Type = Arg1->getType();
3785
3786 Value *LHSAlloca =
3787 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3788 Value *RHSAlloca =
3789 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3790 Value *LHSAddrCast =
3791 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3792 Value *RHSAddrCast =
3793 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3794 Builder.CreateStore(Arg0, LHSAddrCast);
3795 Builder.CreateStore(Arg1, RHSAddrCast);
3796 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3797 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3798 } else {
// On the host the arguments can be used directly.
3799 LHSArrayPtr = ReductionFunc->getArg(0);
3800 RHSArrayPtr = ReductionFunc->getArg(1);
3801 }
3802
3803 unsigned NumReductions = ReductionInfos.size();
3804 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3805
3806 for (auto En : enumerate(ReductionInfos)) {
3807 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3808 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3809 RedArrayTy, LHSArrayPtr, 0, En.index());
3810 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3812 LHSI8Ptr, RI.Variable->getType());
3813 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3814 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3815 RedArrayTy, RHSArrayPtr, 0, En.index());
3816 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3818 RHSI8Ptr, RI.PrivateVariable->getType());
3819 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3820 Value *Reduced;
3822 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3823 if (!AfterIP)
3824 return AfterIP.takeError();
3825
3826 Builder.restoreIP(*AfterIP);
3827 // TODO: Consider flagging an error.
// A cleared insertion point signals early termination by the callback.
3828 if (!Builder.GetInsertBlock())
3829 return Error::success();
3830
3831 // store is inside of the reduction region when using by-ref
3832 if (!IsByRef[En.index()])
3833 Builder.CreateStore(Reduced, LHSPtr);
3834 }
3835 Builder.CreateRetVoid();
3836 return Error::success();
3837}
3838
// Host lowering of OpenMP reductions (GPU configs are forwarded to
// createReductionsGPU). Builds the type-erased array of private reduction
// pointers, calls __kmpc_reduce{_nowait}, and dispatches on its result:
// 1 -> non-atomic elementwise reduction + __kmpc_end_reduce{_nowait},
// 2 -> atomic elementwise reduction (only if every entry has an
// AtomicReductionGen and nothing is by-ref), otherwise fall through to
// the continuation block. Finally the outlined reduction function passed
// to the runtime is populated.
3840 const LocationDescription &Loc, InsertPointTy AllocaIP,
3841 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3842 bool IsNoWait, bool IsTeamsReduction) {
3843 assert(ReductionInfos.size() == IsByRef.size());
3844 if (Config.isGPU())
3845 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3846 IsNoWait, IsTeamsReduction);
3847
3848 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3849
3850 if (!updateToLocation(Loc))
3851 return InsertPointTy();
3852
// Nothing to reduce: keep the current insertion point unchanged.
3853 if (ReductionInfos.size() == 0)
3854 return Builder.saveIP();
3855
// Split off everything after the reduction point into the continuation
// block that all reduction paths branch to.
3856 BasicBlock *InsertBlock = Loc.IP.getBlock();
3857 BasicBlock *ContinuationBlock =
3858 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3859 InsertBlock->getTerminator()->eraseFromParent();
3860
3861 // Create and populate array of type-erased pointers to private reduction
3862 // values.
3863 unsigned NumReductions = ReductionInfos.size();
3864 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3866 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3867
3868 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3869
3870 for (auto En : enumerate(ReductionInfos)) {
3871 unsigned Index = En.index();
3872 const ReductionInfo &RI = En.value();
3873 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3874 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3875 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3876 }
3877
3878 // Emit a call to the runtime function that orchestrates the reduction.
3879 // Declare the reduction function in the process.
3880 Type *IndexTy = Builder.getIndexTy(
3883 Module *Module = Func->getParent();
3884 uint32_t SrcLocStrSize;
3885 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// The atomic path is only viable when every reduction can be done
// atomically; advertise that to the runtime via the ident flags.
3886 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3887 return RI.AtomicReductionGen;
3888 });
3889 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3890 CanGenerateAtomic
3891 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3892 : IdentFlag(0));
3893 Value *ThreadId = getOrCreateThreadID(Ident);
3894 Constant *NumVariables = Builder.getInt32(NumReductions);
3895 const DataLayout &DL = Module->getDataLayout();
3896 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3897 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3898 Function *ReductionFunc = getFreshReductionFunc(*Module);
3899 Value *Lock = getOMPCriticalRegionLock(".reduction");
3901 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3902 : RuntimeFunction::OMPRTL___kmpc_reduce);
3903 CallInst *ReduceCall =
3904 Builder.CreateCall(ReduceFunc,
3905 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3906 ReductionFunc, Lock},
3907 "reduce");
3908
3909 // Create final reduction entry blocks for the atomic and non-atomic case.
3910 // Emit IR that dispatches control flow to one of the blocks based on the
3911 // reduction supporting the atomic mode.
3912 BasicBlock *NonAtomicRedBlock =
3913 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3914 BasicBlock *AtomicRedBlock =
3915 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3916 SwitchInst *Switch =
3917 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3918 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3919 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3920
3921 // Populate the non-atomic reduction using the elementwise reduction function.
3922 // This loads the elements from the global and private variables and reduces
3923 // them before storing back the result to the global variable.
3924 Builder.SetInsertPoint(NonAtomicRedBlock);
3925 for (auto En : enumerate(ReductionInfos)) {
3926 const ReductionInfo &RI = En.value();
3928 // We have one less load for by-ref case because that load is now inside of
3929 // the reduction region
3930 Value *RedValue = RI.Variable;
3931 if (!IsByRef[En.index()]) {
3932 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3933 "red.value." + Twine(En.index()));
3934 }
3935 Value *PrivateRedValue =
3937 "red.private.value." + Twine(En.index()));
3938 Value *Reduced;
3939 InsertPointOrErrorTy AfterIP =
3940 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3941 if (!AfterIP)
3942 return AfterIP.takeError();
3943 Builder.restoreIP(*AfterIP);
3944
// A cleared insertion point signals early termination by the callback.
3945 if (!Builder.GetInsertBlock())
3946 return InsertPointTy();
3947 // for by-ref case, the load is inside of the reduction region
3948 if (!IsByRef[En.index()])
3949 Builder.CreateStore(Reduced, RI.Variable);
3950 }
3951 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3952 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3953 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3954 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3955 Builder.CreateBr(ContinuationBlock);
3956
3957 // Populate the atomic reduction using the atomic elementwise reduction
3958 // function. There are no loads/stores here because they will be happening
3959 // inside the atomic elementwise reduction.
3960 Builder.SetInsertPoint(AtomicRedBlock);
3961 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3962 for (const ReductionInfo &RI : ReductionInfos) {
3965 if (!AfterIP)
3966 return AfterIP.takeError();
3967 Builder.restoreIP(*AfterIP);
3968 if (!Builder.GetInsertBlock())
3969 return InsertPointTy();
3970 }
3971 Builder.CreateBr(ContinuationBlock);
3972 } else {
3974 }
3975
3976 // Populate the outlined reduction function using the elementwise reduction
3977 // function. Partial values are extracted from the type-erased array of
3978 // pointers to private variables.
3979 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3980 IsByRef, /*isGPU=*/false);
3981 if (Err)
3982 return Err;
3983
3984 if (!Builder.GetInsertBlock())
3985 return InsertPointTy();
3986
3987 Builder.SetInsertPoint(ContinuationBlock);
3988 return Builder.saveIP();
3989}
3990
// Emit an OpenMP 'master' region: the body is guarded by paired
// __kmpc_master / __kmpc_end_master runtime calls and only executed when
// the entry call returns non-zero (Conditional = true).
3993 BodyGenCallbackTy BodyGenCB,
3994 FinalizeCallbackTy FiniCB) {
3995 if (!updateToLocation(Loc))
3996 return Loc.IP;
3997
3998 Directive OMPD = Directive::OMPD_master;
3999 uint32_t SrcLocStrSize;
4000 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4001 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4002 Value *ThreadId = getOrCreateThreadID(Ident);
4003 Value *Args[] = {Ident, ThreadId};
4004
4005 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4006 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4007
4008 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4009 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4010
4011 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4012 /*Conditional*/ true, /*hasFinalize*/ true);
4013}
4014
// Emit an OpenMP 'masked' region: like 'master' but the executing thread is
// selected by the Filter value, which is passed to __kmpc_masked (the end
// call __kmpc_end_masked takes no filter).
4017 BodyGenCallbackTy BodyGenCB,
4018 FinalizeCallbackTy FiniCB, Value *Filter) {
4019 if (!updateToLocation(Loc))
4020 return Loc.IP;
4021
4022 Directive OMPD = Directive::OMPD_masked;
4023 uint32_t SrcLocStrSize;
4024 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4025 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4026 Value *ThreadId = getOrCreateThreadID(Ident);
4027 Value *Args[] = {Ident, ThreadId, Filter};
4028 Value *ArgsEnd[] = {Ident, ThreadId};
4029
4030 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4031 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4032
4033 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4034 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4035
4036 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4037 /*Conditional*/ true, /*hasFinalize*/ true);
4038}
4039
// Emit a call to a runtime callee and mark the call site as non-throwing
// (setDoesNotThrow), so no EH cleanup is generated around it.
4041 llvm::FunctionCallee Callee,
4043 const llvm::Twine &Name) {
4044 llvm::CallInst *Call = Builder.CreateCall(
4046 Call->setDoesNotThrow();
4047 return Call;
4048}
4049
4050// Expects the input basic block to be dominated by BeforeScanBB.
4051// Once a scan directive is encountered, the code after the scan directive
4052// should be dominated by AfterScanBB. The scan directive splits the code
4053// sequence into an input phase and a scan phase. Based on whether the
4054// inclusive or exclusive clause is used on the scan directive, and whether
4055// the input loop or the scan loop is being lowered, it adds jumps to the
4056// input and scan phases. The first scan loop is the input loop and the
4057// second is the scan loop. The generated code currently handles only
// inclusive scans.
// Lower the 'scan' directive inside a scan loop. On the first (input) loop
// it allocates the scan buffers (once) and stores the reduction value into
// buffer[IV] at the end of the input phase; on the second (scan) loop it
// loads buffer[IV] back into the reduction variable at the start of the
// scan phase. It then wires up the dispatch/before/after blocks created by
// createScanBBs according to whether the clause is inclusive.
4059 const LocationDescription &Loc, InsertPointTy AllocaIP,
4060 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4061 bool IsInclusive, ScanInfo *ScanRedInfo) {
4062 if (ScanRedInfo->OMPFirstScanLoop) {
4063 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4064 ScanVarsType, ScanRedInfo);
4065 if (Err)
4066 return Err;
4067 }
4068 if (!updateToLocation(Loc))
4069 return Loc.IP;
4070
4071 llvm::Value *IV = ScanRedInfo->IV;
4072
4073 if (ScanRedInfo->OMPFirstScanLoop) {
4074 // Emit buffer[i] = red; at the end of the input phase.
4075 for (size_t i = 0; i < ScanVars.size(); i++) {
4076 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4077 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4078 Type *DestTy = ScanVarsType[i];
4079 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4080 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4081
4082 Builder.CreateStore(Src, Val);
4083 }
4084 }
4085 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4086 emitBlock(ScanRedInfo->OMPScanDispatch,
4088
4089 if (!ScanRedInfo->OMPFirstScanLoop) {
4090 IV = ScanRedInfo->IV;
4091 // Emit red = buffer[i]; at the entrance to the scan phase.
4092 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4093 for (size_t i = 0; i < ScanVars.size(); i++) {
4094 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4095 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4096 Type *DestTy = ScanVarsType[i];
4097 Value *SrcPtr =
4098 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4099 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4100 Builder.CreateStore(Src, ScanVars[i]);
4101 }
4102 }
4103
4104 // TODO: Update it to CreateBr and remove dead blocks
// The branch condition is constant-true; which successor is "live" depends
// on the loop/clause combination.
4105 llvm::Value *CmpI = Builder.getInt1(true);
4106 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4107 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4108 ScanRedInfo->OMPAfterScanBlock);
4109 } else {
4110 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4111 ScanRedInfo->OMPBeforeScanBlock);
4112 }
4113 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4116 return Builder.saveIP();
4117}
4118
// Allocate the per-variable scan buffers. A pointer slot for each scan
// variable is created at AllocaIP; the buffers themselves (Span + 1
// elements each) are malloc'ed by a single thread inside a masked region,
// followed by a barrier so every thread observes the initialized pointers.
4119Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4120 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4121 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4122
4123 Builder.restoreIP(AllocaIP);
4124 // Create the shared pointer at alloca IP.
4125 for (size_t i = 0; i < ScanVars.size(); i++) {
4126 llvm::Value *BuffPtr =
4127 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4128 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4129 }
4130
4131 // Allocate temporary buffer by master thread
4132 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4133 InsertPointTy CodeGenIP) -> Error {
4134 Builder.restoreIP(CodeGenIP);
// One extra element beyond Span so buffer[Span] can hold the final value.
4135 Value *AllocSpan =
4136 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4137 for (size_t i = 0; i < ScanVars.size(); i++) {
4138 Type *IntPtrTy = Builder.getInt32Ty();
4139 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4140 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4141 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4142 AllocSpan, nullptr, "arr");
4143 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4144 }
4145 return Error::success();
4146 };
4147 // TODO: Perform finalization actions for variables. This has to be
4148 // called for variables which have destructors/finalizers.
4149 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4150
// Filter 0: only the thread with id 0 performs the allocation.
4152 llvm::Value *FilterVal = Builder.getInt32(0);
4154 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4155
4156 if (!AfterIP)
4157 return AfterIP.takeError();
4158 Builder.restoreIP(*AfterIP);
4159 BasicBlock *InputBB = Builder.GetInsertBlock();
4160 if (InputBB->getTerminator())
// Barrier so all threads see the freshly allocated buffers.
4162 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4163 if (!AfterIP)
4164 return AfterIP.takeError();
4165 Builder.restoreIP(*AfterIP);
4166
4167 return Error::success();
4168}
4169
// Finalization of a scan-based directive: inside a masked region (thread 0)
// copy the last buffer element (buffer[Span]) back into each original
// reduction variable and free the malloc'ed buffer, then emit a barrier.
4170Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4171 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4172 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4173 InsertPointTy CodeGenIP) -> Error {
4174 Builder.restoreIP(CodeGenIP);
4175 for (ReductionInfo RedInfo : ReductionInfos) {
4176 Value *PrivateVar = RedInfo.PrivateVariable;
4177 Value *OrigVar = RedInfo.Variable;
4178 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4179 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4180
4181 Type *SrcTy = RedInfo.ElementType;
4182 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4183 "arrayOffset");
4184 Value *Src = Builder.CreateLoad(SrcTy, Val);
4185
4186 Builder.CreateStore(Src, OrigVar);
4187 Builder.CreateFree(Buff);
4188 }
4189 return Error::success();
4190 };
4191 // TODO: Perform finalization actions for variables. This has to be
4192 // called for variables which have destructors/finalizers.
4193 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4194
4195 if (ScanRedInfo->OMPScanFinish->getTerminator())
4197 else
4198 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4199
// Filter 0: only the thread with id 0 performs the copy-back and free.
4200 llvm::Value *FilterVal = Builder.getInt32(0);
4202 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4203
4204 if (!AfterIP)
4205 return AfterIP.takeError();
4206 Builder.restoreIP(*AfterIP);
4207 BasicBlock *InputBB = Builder.GetInsertBlock();
4208 if (InputBB->getTerminator())
4210 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4211 if (!AfterIP)
4212 return AfterIP.takeError();
4213 Builder.restoreIP(*AfterIP);
4214 return Error::success();
4215}
4216
// Emit the cross-iteration combining step of the scan: a Blelloch-style
// up-sweep over the scan buffers, performed by a single thread inside a
// masked region and followed by a barrier, then the finalization pass.
// Outer loop runs k = 0 .. ceil(log2(Span)); inner loop applies
// tmp[i] op= tmp[i - 2^k] for i = Span-1 down to 2^k.
4218 const LocationDescription &Loc,
4220 ScanInfo *ScanRedInfo) {
4221
4222 if (!updateToLocation(Loc))
4223 return Loc.IP;
4224 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4225 InsertPointTy CodeGenIP) -> Error {
4226 Builder.restoreIP(CodeGenIP);
4228 // for (int k = 0; k <= ceil(log2(n)); ++k)
4229 llvm::BasicBlock *LoopBB =
4230 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4231 llvm::BasicBlock *ExitBB =
4232 splitBB(Builder, false, "omp.outer.log.scan.exit");
// Compute the outer trip count ceil(log2(Span)) with the llvm.log2 and
// llvm.ceil intrinsics, going through double and back to i32.
4235 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4237 llvm::Value *Arg =
4238 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4239 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4242 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4243 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4244 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4246 ScanRedInfo->Span,
4247 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4248 Builder.SetInsertPoint(InputBB);
4249 Builder.CreateBr(LoopBB);
4250 emitBlock(LoopBB, CurFn);
4251 Builder.SetInsertPoint(LoopBB);
4252
// Outer-loop PHIs: the iteration counter k and pow2k = 2^k.
4253 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4254 // size pow2k = 1;
4256 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4257 InputBB);
4258 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4259 InputBB);
4260 // for (size i = n - 1; i >= 2 ^ k; --i)
4261 // tmp[i] op= tmp[i-pow2k];
4262 llvm::BasicBlock *InnerLoopBB =
4263 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4264 llvm::BasicBlock *InnerExitBB =
4265 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4266 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4267 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4268 emitBlock(InnerLoopBB, CurFn);
4269 Builder.SetInsertPoint(InnerLoopBB);
4271 IVal->addIncoming(NMin1, LoopBB);
4272 for (ReductionInfo RedInfo : ReductionInfos) {
4273 Value *ReductionVal = RedInfo.PrivateVariable;
4274 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4275 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4276 Type *DestTy = RedInfo.ElementType;
4278 Value *LHSPtr =
4279 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4280 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4281 Value *RHSPtr =
4282 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4283 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4284 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4285 llvm::Value *Result;
4286 InsertPointOrErrorTy AfterIP =
4287 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4288 if (!AfterIP)
4289 return AfterIP.takeError();
4290 Builder.CreateStore(Result, LHSPtr);
4291 }
// Inner-loop latch: i-- and loop while i >= pow2k.
4292 llvm::Value *NextIVal = Builder.CreateNUWSub(
4293 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4294 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4295 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4296 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4297 emitBlock(InnerExitBB, CurFn);
4299 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4300 Counter->addIncoming(Next, Builder.GetInsertBlock());
4301 // pow2k <<= 1;
4302 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4303 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4304 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4305 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4307 return Error::success();
4308 };
4309
4310 // TODO: Perform finalization actions for variables. This has to be
4311 // called for variables which have destructors/finalizers.
4312 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4313
// Filter 0: the up-sweep is performed by the thread with id 0 only.
4314 llvm::Value *FilterVal = Builder.getInt32(0);
4316 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4317
4318 if (!AfterIP)
4319 return AfterIP.takeError();
4320 Builder.restoreIP(*AfterIP);
4321 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4322
4323 if (!AfterIP)
4324 return AfterIP.takeError();
4325 Builder.restoreIP(*AfterIP);
4326 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4327 if (Err)
4328 return Err;
4329
4330 return AfterIP;
4331}
4332
4333Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4334 llvm::function_ref<Error()> InputLoopGen,
4335 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4336 ScanInfo *ScanRedInfo) {
4337
4338 {
4339 // Emit loop with input phase:
4340 // for (i: 0..<num_iters>) {
4341 // <input phase>;
4342 // buffer[i] = red;
4343 // }
4344 ScanRedInfo->OMPFirstScanLoop = true;
4345 Error Err = InputLoopGen();
4346 if (Err)
4347 return Err;
4348 }
4349 {
4350 // Emit loop with scan phase:
4351 // for (i: 0..<num_iters>) {
4352 // red = buffer[i];
4353 // <scan phase>;
4354 // }
4355 ScanRedInfo->OMPFirstScanLoop = false;
4356 Error Err = ScanLoopGen(Builder.saveIP());
4357 if (Err)
4358 return Err;
4359 }
4360 return Error::success();
4361}
4362
4363void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4365 ScanRedInfo->OMPScanDispatch =
4366 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4367 ScanRedInfo->OMPAfterScanBlock =
4368 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4369 ScanRedInfo->OMPBeforeScanBlock =
4370 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4371 ScanRedInfo->OMPScanLoopExit =
4372 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4373}
4375 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4376 BasicBlock *PostInsertBefore, const Twine &Name) {
4377 Module *M = F->getParent();
4378 LLVMContext &Ctx = M->getContext();
4379 Type *IndVarTy = TripCount->getType();
4380
4381 // Create the basic block structure.
4382 BasicBlock *Preheader =
4383 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4384 BasicBlock *Header =
4385 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4386 BasicBlock *Cond =
4387 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4388 BasicBlock *Body =
4389 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4390 BasicBlock *Latch =
4391 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4392 BasicBlock *Exit =
4393 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4394 BasicBlock *After =
4395 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4396
4397 // Use specified DebugLoc for new instructions.
4399
4400 Builder.SetInsertPoint(Preheader);
4401 Builder.CreateBr(Header);
4402
4403 Builder.SetInsertPoint(Header);
4404 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4405 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4407
4409 Value *Cmp =
4410 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4411 Builder.CreateCondBr(Cmp, Body, Exit);
4412
4413 Builder.SetInsertPoint(Body);
4414 Builder.CreateBr(Latch);
4415
4416 Builder.SetInsertPoint(Latch);
4417 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4418 "omp_" + Name + ".next", /*HasNUW=*/true);
4419 Builder.CreateBr(Header);
4420 IndVarPHI->addIncoming(Next, Latch);
4421
4422 Builder.SetInsertPoint(Exit);
4423 Builder.CreateBr(After);
4424
4425 // Remember and return the canonical control flow.
4426 LoopInfos.emplace_front();
4427 CanonicalLoopInfo *CL = &LoopInfos.front();
4428
4429 CL->Header = Header;
4430 CL->Cond = Cond;
4431 CL->Latch = Latch;
4432 CL->Exit = Exit;
4433
4434#ifndef NDEBUG
4435 CL->assertOK();
4436#endif
4437 return CL;
4438}
4439
4442 LoopBodyGenCallbackTy BodyGenCB,
4443 Value *TripCount, const Twine &Name) {
4444 BasicBlock *BB = Loc.IP.getBlock();
4445 BasicBlock *NextBB = BB->getNextNode();
4446
4447 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4448 NextBB, NextBB, Name);
4449 BasicBlock *After = CL->getAfter();
4450
4451 // If location is not set, don't connect the loop.
4452 if (updateToLocation(Loc)) {
4453 // Split the loop at the insertion point: Branch to the preheader and move
4454 // every following instruction to after the loop (the After BB). Also, the
4455 // new successor is the loop's after block.
4456 spliceBB(Builder, After, /*CreateBranch=*/false);
4458 }
4459
4460 // Emit the body content. We do it after connecting the loop to the CFG to
4461 // avoid that the callback encounters degenerate BBs.
4462 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4463 return Err;
4464
4465#ifndef NDEBUG
4466 CL->assertOK();
4467#endif
4468 return CL;
4469}
4470
4472 ScanInfos.emplace_front();
4473 ScanInfo *Result = &ScanInfos.front();
4474 return Result;
4475}
4476
4479 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4480 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4481 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4482 LocationDescription ComputeLoc =
4483 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4484 updateToLocation(ComputeLoc);
4485
4487
4489 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4490 ScanRedInfo->Span = TripCount;
4491 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4492 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4493
4494 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4495 Builder.restoreIP(CodeGenIP);
4496 ScanRedInfo->IV = IV;
4497 createScanBBs(ScanRedInfo);
4498 BasicBlock *InputBlock = Builder.GetInsertBlock();
4499 Instruction *Terminator = InputBlock->getTerminator();
4500 assert(Terminator->getNumSuccessors() == 1);
4501 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4502 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4503 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4505 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4506 emitBlock(ScanRedInfo->OMPScanLoopExit,
4508 Builder.CreateBr(ContinueBlock);
4510 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4511 return BodyGenCB(Builder.saveIP(), IV);
4512 };
4513
4514 const auto &&InputLoopGen = [&]() -> Error {
4516 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4517 ComputeIP, Name, true, ScanRedInfo);
4518 if (!LoopInfo)
4519 return LoopInfo.takeError();
4520 Result.push_back(*LoopInfo);
4521 Builder.restoreIP((*LoopInfo)->getAfterIP());
4522 return Error::success();
4523 };
4524 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4526 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4527 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4528 if (!LoopInfo)
4529 return LoopInfo.takeError();
4530 Result.push_back(*LoopInfo);
4531 Builder.restoreIP((*LoopInfo)->getAfterIP());
4532 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4533 return Error::success();
4534 };
4535 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4536 if (Err)
4537 return Err;
4538 return Result;
4539}
4540
4542 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4543 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4544
4545 // Consider the following difficulties (assuming 8-bit signed integers):
4546 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4547 // DO I = 1, 100, 50
4548 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
4549 // DO I = 100, 0, -128
4550
4551 // Start, Stop and Step must be of the same integer type.
4552 auto *IndVarTy = cast<IntegerType>(Start->getType());
4553 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4554 assert(IndVarTy == Step->getType() && "Step type mismatch");
4555
4556 updateToLocation(Loc);
4557
4558 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4559 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4560
4561 // Like Step, but always positive.
4562 Value *Incr = Step;
4563
4564 // Distance between Start and Stop; always positive.
4565 Value *Span;
4566
4567 // Condition whether there are no iterations are executed at all, e.g. because
4568 // UB < LB.
4569 Value *ZeroCmp;
4570
4571 if (IsSigned) {
4572 // Ensure that increment is positive. If not, negate and invert LB and UB.
4573 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4574 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4575 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4576 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4577 Span = Builder.CreateSub(UB, LB, "", false, true);
4578 ZeroCmp = Builder.CreateICmp(
4579 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4580 } else {
4581 Span = Builder.CreateSub(Stop, Start, "", true);
4582 ZeroCmp = Builder.CreateICmp(
4583 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4584 }
4585
4586 Value *CountIfLooping;
4587 if (InclusiveStop) {
4588 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4589 } else {
4590 // Avoid incrementing past stop since it could overflow.
4591 Value *CountIfTwo = Builder.CreateAdd(
4592 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4593 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4594 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4595 }
4596
4597 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4598 "omp_" + Name + ".tripcount");
4599}
4600
4602 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4603 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4604 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4605 ScanInfo *ScanRedInfo) {
4606 LocationDescription ComputeLoc =
4607 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4608
4610 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4611
4612 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4613 Builder.restoreIP(CodeGenIP);
4614 Value *Span = Builder.CreateMul(IV, Step);
4615 Value *IndVar = Builder.CreateAdd(Span, Start);
4616 if (InScan)
4617 ScanRedInfo->IV = IndVar;
4618 return BodyGenCB(Builder.saveIP(), IndVar);
4619 };
4620 LocationDescription LoopLoc =
4621 ComputeIP.isSet()
4622 ? Loc
4625 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4626}
4627
4628// Returns an LLVM function to call for initializing loop bounds using OpenMP
4629// static scheduling for composite `distribute parallel for` depending on
4630// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4631// integers as unsigned similarly to CanonicalLoopInfo.
4632static FunctionCallee
4634 OpenMPIRBuilder &OMPBuilder) {
4635 unsigned Bitwidth = Ty->getIntegerBitWidth();
4636 if (Bitwidth == 32)
4637 return OMPBuilder.getOrCreateRuntimeFunction(
4638 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4639 if (Bitwidth == 64)
4640 return OMPBuilder.getOrCreateRuntimeFunction(
4641 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4642 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4643}
4644
4645// Returns an LLVM function to call for initializing loop bounds using OpenMP
4646// static scheduling depending on `type`. Only i32 and i64 are supported by the
4647// runtime. Always interpret integers as unsigned similarly to
4648// CanonicalLoopInfo.
4650 OpenMPIRBuilder &OMPBuilder) {
4651 unsigned Bitwidth = Ty->getIntegerBitWidth();
4652 if (Bitwidth == 32)
4653 return OMPBuilder.getOrCreateRuntimeFunction(
4654 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4655 if (Bitwidth == 64)
4656 return OMPBuilder.getOrCreateRuntimeFunction(
4657 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4658 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4659}
4660
4661OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4662 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4663 WorksharingLoopType LoopType, bool NeedsBarrier) {
4664 assert(CLI->isValid() && "Requires a valid canonical loop");
4665 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4666 "Require dedicated allocate IP");
4667
4668 // Set up the source location value for OpenMP runtime.
4671
4672 uint32_t SrcLocStrSize;
4673 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4674 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4675
4676 // Declare useful OpenMP runtime functions.
4677 Value *IV = CLI->getIndVar();
4678 Type *IVTy = IV->getType();
4679 FunctionCallee StaticInit =
4680 LoopType == WorksharingLoopType::DistributeForStaticLoop
4681 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4682 : getKmpcForStaticInitForType(IVTy, M, *this);
4683 FunctionCallee StaticFini =
4684 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4685
4686 // Allocate space for computed loop bounds as expected by the "init" function.
4687 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4688
4689 Type *I32Type = Type::getInt32Ty(M.getContext());
4690 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4691 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4692 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4693 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4694 CLI->setLastIter(PLastIter);
4695
4696 // At the end of the preheader, prepare for calling the "init" function by
4697 // storing the current loop bounds into the allocated space. A canonical loop
4698 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4699 // and produces an inclusive upper bound.
4701 Constant *Zero = ConstantInt::get(IVTy, 0);
4702 Constant *One = ConstantInt::get(IVTy, 1);
4703 Builder.CreateStore(Zero, PLowerBound);
4704 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4705 Builder.CreateStore(UpperBound, PUpperBound);
4706 Builder.CreateStore(One, PStride);
4707
4708 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4709
4710 OMPScheduleType SchedType =
4711 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4712 ? OMPScheduleType::OrderedDistribute
4713 : OMPScheduleType::UnorderedStatic;
4714 Constant *SchedulingType =
4715 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4716
4717 // Call the "init" function and update the trip count of the loop with the
4718 // value it produced.
4720 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4721 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4722 Value *PDistUpperBound =
4723 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4724 Args.push_back(PDistUpperBound);
4725 }
4726 Args.append({PStride, One, Zero});
4727 Builder.CreateCall(StaticInit, Args);
4728 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4729 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4730 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4731 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4732 CLI->setTripCount(TripCount);
4733
4734 // Update all uses of the induction variable except the one in the condition
4735 // block that compares it with the actual upper bound, and the increment in
4736 // the latch block.
4737
4738 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4740 CLI->getBody()->getFirstInsertionPt());
4742 return Builder.CreateAdd(OldIV, LowerBound);
4743 });
4744
4745 // In the "exit" block, call the "fini" function.
4747 CLI->getExit()->getTerminator()->getIterator());
4748 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4749
4750 // Add the barrier if requested.
4751 if (NeedsBarrier) {
4752 InsertPointOrErrorTy BarrierIP =
4753 createBarrier(LocationDescription(Builder.saveIP(), DL),
4754 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4755 /* CheckCancelFlag */ false);
4756 if (!BarrierIP)
4757 return BarrierIP.takeError();
4758 }
4759
4760 InsertPointTy AfterIP = CLI->getAfterIP();
4761 CLI->invalidate();
4762
4763 return AfterIP;
4764}
4765
4767OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4768 CanonicalLoopInfo *CLI,
4769 InsertPointTy AllocaIP,
4770 bool NeedsBarrier,
4771 Value *ChunkSize) {
4772 assert(CLI->isValid() && "Requires a valid canonical loop");
4773 assert(ChunkSize && "Chunk size is required");
4774
4775 LLVMContext &Ctx = CLI->getFunction()->getContext();
4776 Value *IV = CLI->getIndVar();
4777 Value *OrigTripCount = CLI->getTripCount();
4778 Type *IVTy = IV->getType();
4779 assert(IVTy->getIntegerBitWidth() <= 64 &&
4780 "Max supported tripcount bitwidth is 64 bits");
4781 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4782 : Type::getInt64Ty(Ctx);
4783 Type *I32Type = Type::getInt32Ty(M.getContext());
4784 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4785 Constant *One = ConstantInt::get(InternalIVTy, 1);
4786
4787 // Declare useful OpenMP runtime functions.
4788 FunctionCallee StaticInit =
4789 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4790 FunctionCallee StaticFini =
4791 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4792
4793 // Allocate space for computed loop bounds as expected by the "init" function.
4794 Builder.restoreIP(AllocaIP);
4796 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4797 Value *PLowerBound =
4798 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4799 Value *PUpperBound =
4800 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4801 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4802 CLI->setLastIter(PLastIter);
4803
4804 // Set up the source location value for the OpenMP runtime.
4807
4808 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4809 Value *CastedChunkSize =
4810 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4811 Value *CastedTripCount =
4812 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4813
4814 Constant *SchedulingType = ConstantInt::get(
4815 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4816 Builder.CreateStore(Zero, PLowerBound);
4817 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4818 Builder.CreateStore(OrigUpperBound, PUpperBound);
4819 Builder.CreateStore(One, PStride);
4820
4821 // Call the "init" function and update the trip count of the loop with the
4822 // value it produced.
4823 uint32_t SrcLocStrSize;
4824 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4825 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4826 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4827 Builder.CreateCall(StaticInit,
4828 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4829 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4830 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4831 /*pstride=*/PStride, /*incr=*/One,
4832 /*chunk=*/CastedChunkSize});
4833
4834 // Load values written by the "init" function.
4835 Value *FirstChunkStart =
4836 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4837 Value *FirstChunkStop =
4838 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4839 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4840 Value *ChunkRange =
4841 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4842 Value *NextChunkStride =
4843 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4844
4845 // Create outer "dispatch" loop for enumerating the chunks.
4846 BasicBlock *DispatchEnter = splitBB(Builder, true);
4847 Value *DispatchCounter;
4848
4849 // It is safe to assume this didn't return an error because the callback
4850 // passed into createCanonicalLoop is the only possible error source, and it
4851 // always returns success.
4853 {Builder.saveIP(), DL},
4854 [&](InsertPointTy BodyIP, Value *Counter) {
4855 DispatchCounter = Counter;
4856 return Error::success();
4857 },
4858 FirstChunkStart, CastedTripCount, NextChunkStride,
4859 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4860 "dispatch"));
4861
4862 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4863 // not have to preserve the canonical invariant.
4864 BasicBlock *DispatchBody = DispatchCLI->getBody();
4865 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4866 BasicBlock *DispatchExit = DispatchCLI->getExit();
4867 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4868 DispatchCLI->invalidate();
4869
4870 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4871 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4872 redirectTo(CLI->getExit(), DispatchLatch, DL);
4873 redirectTo(DispatchBody, DispatchEnter, DL);
4874
4875 // Prepare the prolog of the chunk loop.
4878
4879 // Compute the number of iterations of the chunk loop.
4881 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4882 Value *IsLastChunk =
4883 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4884 Value *CountUntilOrigTripCount =
4885 Builder.CreateSub(CastedTripCount, DispatchCounter);
4886 Value *ChunkTripCount = Builder.CreateSelect(
4887 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4888 Value *BackcastedChunkTC =
4889 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4890 CLI->setTripCount(BackcastedChunkTC);
4891
4892 // Update all uses of the induction variable except the one in the condition
4893 // block that compares it with the actual upper bound, and the increment in
4894 // the latch block.
4895 Value *BackcastedDispatchCounter =
4896 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4897 CLI->mapIndVar([&](Instruction *) -> Value * {
4898 Builder.restoreIP(CLI->getBodyIP());
4899 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4900 });
4901
4902 // In the "exit" block, call the "fini" function.
4903 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4904 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4905
4906 // Add the barrier if requested.
4907 if (NeedsBarrier) {
4908 InsertPointOrErrorTy AfterIP =
4909 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4910 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4911 if (!AfterIP)
4912 return AfterIP.takeError();
4913 }
4914
4915#ifndef NDEBUG
4916 // Even though we currently do not support applying additional methods to it,
4917 // the chunk loop should remain a canonical loop.
4918 CLI->assertOK();
4919#endif
4920
4921 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4922}
4923
4924// Returns an LLVM function to call for executing an OpenMP static worksharing
4925// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4926// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4927static FunctionCallee
4929 WorksharingLoopType LoopType) {
4930 unsigned Bitwidth = Ty->getIntegerBitWidth();
4931 Module &M = OMPBuilder->M;
4932 switch (LoopType) {
4933 case WorksharingLoopType::ForStaticLoop:
4934 if (Bitwidth == 32)
4935 return OMPBuilder->getOrCreateRuntimeFunction(
4936 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4937 if (Bitwidth == 64)
4938 return OMPBuilder->getOrCreateRuntimeFunction(
4939 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4940 break;
4941 case WorksharingLoopType::DistributeStaticLoop:
4942 if (Bitwidth == 32)
4943 return OMPBuilder->getOrCreateRuntimeFunction(
4944 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4945 if (Bitwidth == 64)
4946 return OMPBuilder->getOrCreateRuntimeFunction(
4947 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4948 break;
4949 case WorksharingLoopType::DistributeForStaticLoop:
4950 if (Bitwidth == 32)
4951 return OMPBuilder->getOrCreateRuntimeFunction(
4952 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4953 if (Bitwidth == 64)
4954 return OMPBuilder->getOrCreateRuntimeFunction(
4955 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4956 break;
4957 }
4958 if (Bitwidth != 32 && Bitwidth != 64) {
4959 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4960 }
4961 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4962}
4963
4964// Inserts a call to proper OpenMP Device RTL function which handles
4965// loop worksharing.
4967 WorksharingLoopType LoopType,
4968 BasicBlock *InsertBlock, Value *Ident,
4969 Value *LoopBodyArg, Value *TripCount,
4970 Function &LoopBodyFn) {
4971 Type *TripCountTy = TripCount->getType();
4972 Module &M = OMPBuilder->M;
4973 IRBuilder<> &Builder = OMPBuilder->Builder;
4974 FunctionCallee RTLFn =
4975 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4976 SmallVector<Value *, 8> RealArgs;
4977 RealArgs.push_back(Ident);
4978 RealArgs.push_back(&LoopBodyFn);
4979 RealArgs.push_back(LoopBodyArg);
4980 RealArgs.push_back(TripCount);
4981 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4982 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4983 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4984 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4985 Builder.CreateCall(RTLFn, RealArgs);
4986 return;
4987 }
4988 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4989 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4990 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4991 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4992
4993 RealArgs.push_back(
4994 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4995 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4996 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4997 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4998 }
4999 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5000
5001 Builder.CreateCall(RTLFn, RealArgs);
5002}
5003
5005 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5006 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5007 WorksharingLoopType LoopType) {
5008 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5009 BasicBlock *Preheader = CLI->getPreheader();
5010 Value *TripCount = CLI->getTripCount();
5011
5012 // After loop body outling, the loop body contains only set up
5013 // of loop body argument structure and the call to the outlined
5014 // loop body function. Firstly, we need to move setup of loop body args
5015 // into loop preheader.
5016 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5017 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5018
5019 // The next step is to remove the whole loop. We do not it need anymore.
5020 // That's why make an unconditional branch from loop preheader to loop
5021 // exit block
5022 Builder.restoreIP({Preheader, Preheader->end()});
5023 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5024 Preheader->getTerminator()->eraseFromParent();
5025 Builder.CreateBr(CLI->getExit());
5026
5027 // Delete dead loop blocks
5028 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5029 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5030 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5031 CleanUpInfo.EntryBB = CLI->getHeader();
5032 CleanUpInfo.ExitBB = CLI->getExit();
5033 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5034 DeleteDeadBlocks(BlocksToBeRemoved);
5035
5036 // Find the instruction which corresponds to loop body argument structure
5037 // and remove the call to loop body function instruction.
5038 Value *LoopBodyArg;
5039 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5040 assert(OutlinedFnUser &&
5041 "Expected unique undroppable user of outlined function");
5042 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5043 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5044 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5045 "Expected outlined function call to be located in loop preheader");
5046 // Check in case no argument structure has been passed.
5047 if (OutlinedFnCallInstruction->arg_size() > 1)
5048 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5049 else
5050 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5051 OutlinedFnCallInstruction->eraseFromParent();
5052
5053 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5054 LoopBodyArg, TripCount, OutlinedFn);
5055
5056 for (auto &ToBeDeletedItem : ToBeDeleted)
5057 ToBeDeletedItem->eraseFromParent();
5058 CLI->invalidate();
5059}
5060
5062OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
5063 InsertPointTy AllocaIP,
5064 WorksharingLoopType LoopType) {
5065 uint32_t SrcLocStrSize;
5066 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5067 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5068
5069 OutlineInfo OI;
5070 OI.OuterAllocaBB = CLI->getPreheader();
5071 Function *OuterFn = CLI->getPreheader()->getParent();
5072
5073 // Instructions which need to be deleted at the end of code generation
5075
5076 OI.OuterAllocaBB = AllocaIP.getBlock();
5077
5078 // Mark the body loop as region which needs to be extracted
5079 OI.EntryBB = CLI->getBody();
5080 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5081 "omp.prelatch", true);
5082
5083 // Prepare loop body for extraction
5084 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5085
5086 // Insert new loop counter variable which will be used only in loop
5087 // body.
5088 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5089 Instruction *NewLoopCntLoad =
5090 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5091 // New loop counter instructions are redundant in the loop preheader when
5092 // code generation for workshare loop is finshed. That's why mark them as
5093 // ready for deletion.
5094 ToBeDeleted.push_back(NewLoopCntLoad);
5095 ToBeDeleted.push_back(NewLoopCnt);
5096
5097 // Analyse loop body region. Find all input variables which are used inside
5098 // loop body region.
5099 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5101 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5102
5103 CodeExtractorAnalysisCache CEAC(*OuterFn);
5104 CodeExtractor Extractor(Blocks,
5105 /* DominatorTree */ nullptr,
5106 /* AggregateArgs */ true,
5107 /* BlockFrequencyInfo */ nullptr,
5108 /* BranchProbabilityInfo */ nullptr,
5109 /* AssumptionCache */ nullptr,
5110 /* AllowVarArgs */ true,
5111 /* AllowAlloca */ true,
5112 /* AllocationBlock */ CLI->getPreheader(),
5113 /* Suffix */ ".omp_wsloop",
5114 /* AggrArgsIn0AddrSpace */ true);
5115
5116 BasicBlock *CommonExit = nullptr;
5117 SetVector<Value *> SinkingCands, HoistingCands;
5118
5119 // Find allocas outside the loop body region which are used inside loop
5120 // body
5121 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5122
5123 // We need to model loop body region as the function f(cnt, loop_arg).
5124 // That's why we replace loop induction variable by the new counter
5125 // which will be one of loop body function argument
5127 CLI->getIndVar()->user_end());
5128 for (auto Use : Users) {
5129 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5130 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5131 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5132 }
5133 }
5134 }
5135 // Make sure that loop counter variable is not merged into loop body
5136 // function argument structure and it is passed as separate variable
5137 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5138
5139 // PostOutline CB is invoked when loop body function is outlined and
5140 // loop body is replaced by call to outlined function. We need to add
5141 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
5142 // function will handle loop control logic.
5143 //
5144 OI.PostOutlineCB = [=, ToBeDeletedVec =
5145 std::move(ToBeDeleted)](Function &OutlinedFn) {
5146 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5147 LoopType);
5148 };
5149 addOutlineInfo(std::move(OI));
5150 return CLI->getAfterIP();
5151}
5152
5155 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5156 bool HasSimdModifier, bool HasMonotonicModifier,
5157 bool HasNonmonotonicModifier, bool HasOrderedClause,
5158 WorksharingLoopType LoopType) {
5159 if (Config.isTargetDevice())
5160 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
5161 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5162 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5163 HasNonmonotonicModifier, HasOrderedClause);
5164
5165 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5166 OMPScheduleType::ModifierOrdered;
5167 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5168 case OMPScheduleType::BaseStatic:
5169 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5170 if (IsOrdered)
5171 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5172 NeedsBarrier, ChunkSize);
5173 // FIXME: Monotonicity ignored?
5174 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5175
5176 case OMPScheduleType::BaseStaticChunked:
5177 if (IsOrdered)
5178 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5179 NeedsBarrier, ChunkSize);
5180 // FIXME: Monotonicity ignored?
5181 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5182 ChunkSize);
5183
5184 case OMPScheduleType::BaseRuntime:
5185 case OMPScheduleType::BaseAuto:
5186 case OMPScheduleType::BaseGreedy:
5187 case OMPScheduleType::BaseBalanced:
5188 case OMPScheduleType::BaseSteal:
5189 case OMPScheduleType::BaseGuidedSimd:
5190 case OMPScheduleType::BaseRuntimeSimd:
5191 assert(!ChunkSize &&
5192 "schedule type does not support user-defined chunk sizes");
5193 [[fallthrough]];
5194 case OMPScheduleType::BaseDynamicChunked:
5195 case OMPScheduleType::BaseGuidedChunked:
5196 case OMPScheduleType::BaseGuidedIterativeChunked:
5197 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5198 case OMPScheduleType::BaseStaticBalancedChunked:
5199 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5200 NeedsBarrier, ChunkSize);
5201
5202 default:
5203 llvm_unreachable("Unknown/unimplemented schedule kind");
5204 }
5205}
5206
5207/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5208/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5209/// the runtime. Always interpret integers as unsigned similarly to
5210/// CanonicalLoopInfo.
5211static FunctionCallee
5213 unsigned Bitwidth = Ty->getIntegerBitWidth();
5214 if (Bitwidth == 32)
5215 return OMPBuilder.getOrCreateRuntimeFunction(
5216 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5217 if (Bitwidth == 64)
5218 return OMPBuilder.getOrCreateRuntimeFunction(
5219 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5220 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5221}
5222
5223/// Returns an LLVM function to call for updating the next loop using OpenMP
5224/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5225/// the runtime. Always interpret integers as unsigned similarly to
5226/// CanonicalLoopInfo.
5227static FunctionCallee
5229 unsigned Bitwidth = Ty->getIntegerBitWidth();
5230 if (Bitwidth == 32)
5231 return OMPBuilder.getOrCreateRuntimeFunction(
5232 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5233 if (Bitwidth == 64)
5234 return OMPBuilder.getOrCreateRuntimeFunction(
5235 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5236 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5237}
5238
5239/// Returns an LLVM function to call for finalizing the dynamic loop using
5240/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5241/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5242static FunctionCallee
5244 unsigned Bitwidth = Ty->getIntegerBitWidth();
5245 if (Bitwidth == 32)
5246 return OMPBuilder.getOrCreateRuntimeFunction(
5247 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5248 if (Bitwidth == 64)
5249 return OMPBuilder.getOrCreateRuntimeFunction(
5250 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5251 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5252}
5253
5255OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5256 InsertPointTy AllocaIP,
5257 OMPScheduleType SchedType,
5258 bool NeedsBarrier, Value *Chunk) {
5259 assert(CLI->isValid() && "Requires a valid canonical loop");
5260 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5261 "Require dedicated allocate IP");
5263 "Require valid schedule type");
5264
5265 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5266 OMPScheduleType::ModifierOrdered;
5267
5268 // Set up the source location value for OpenMP runtime.
5270
5271 uint32_t SrcLocStrSize;
5272 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5273 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5274
5275 // Declare useful OpenMP runtime functions.
5276 Value *IV = CLI->getIndVar();
5277 Type *IVTy = IV->getType();
5278 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5279 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5280
5281 // Allocate space for computed loop bounds as expected by the "init" function.
5282 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5283 Type *I32Type = Type::getInt32Ty(M.getContext());
5284 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5285 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5286 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5287 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5288 CLI->setLastIter(PLastIter);
5289
5290 // At the end of the preheader, prepare for calling the "init" function by
5291 // storing the current loop bounds into the allocated space. A canonical loop
5292 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5293 // and produces an inclusive upper bound.
5294 BasicBlock *PreHeader = CLI->getPreheader();
5295 Builder.SetInsertPoint(PreHeader->getTerminator());
5296 Constant *One = ConstantInt::get(IVTy, 1);
5297 Builder.CreateStore(One, PLowerBound);
5298 Value *UpperBound = CLI->getTripCount();
5299 Builder.CreateStore(UpperBound, PUpperBound);
5300 Builder.CreateStore(One, PStride);
5301
5302 BasicBlock *Header = CLI->getHeader();
5303 BasicBlock *Exit = CLI->getExit();
5304 BasicBlock *Cond = CLI->getCond();
5305 BasicBlock *Latch = CLI->getLatch();
5306 InsertPointTy AfterIP = CLI->getAfterIP();
5307
5308 // The CLI will be "broken" in the code below, as the loop is no longer
5309 // a valid canonical loop.
5310
5311 if (!Chunk)
5312 Chunk = One;
5313
5314 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5315
5316 Constant *SchedulingType =
5317 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5318
5319 // Call the "init" function.
5320 Builder.CreateCall(DynamicInit,
5321 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5322 UpperBound, /* step */ One, Chunk});
5323
5324 // An outer loop around the existing one.
5325 BasicBlock *OuterCond = BasicBlock::Create(
5326 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5327 PreHeader->getParent());
5328 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5329 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5330 Value *Res =
5331 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5332 PLowerBound, PUpperBound, PStride});
5333 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5334 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5335 Value *LowerBound =
5336 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5337 Builder.CreateCondBr(MoreWork, Header, Exit);
5338
5339 // Change PHI-node in loop header to use outer cond rather than preheader,
5340 // and set IV to the LowerBound.
5341 Instruction *Phi = &Header->front();
5342 auto *PI = cast<PHINode>(Phi);
5343 PI->setIncomingBlock(0, OuterCond);
5344 PI->setIncomingValue(0, LowerBound);
5345
5346 // Then set the pre-header to jump to the OuterCond
5347 Instruction *Term = PreHeader->getTerminator();
5348 auto *Br = cast<BranchInst>(Term);
5349 Br->setSuccessor(0, OuterCond);
5350
5351 // Modify the inner condition:
5352 // * Use the UpperBound returned from the DynamicNext call.
5353 // * jump to the loop outer loop when done with one of the inner loops.
5354 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5355 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5357 auto *CI = cast<CmpInst>(Comp);
5358 CI->setOperand(1, UpperBound);
5359 // Redirect the inner exit to branch to outer condition.
5360 Instruction *Branch = &Cond->back();
5361 auto *BI = cast<BranchInst>(Branch);
5362 assert(BI->getSuccessor(1) == Exit);
5363 BI->setSuccessor(1, OuterCond);
5364
5365 // Call the "fini" function if "ordered" is present in wsloop directive.
5366 if (Ordered) {
5367 Builder.SetInsertPoint(&Latch->back());
5368 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5369 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5370 }
5371
5372 // Add the barrier if requested.
5373 if (NeedsBarrier) {
5374 Builder.SetInsertPoint(&Exit->back());
5375 InsertPointOrErrorTy BarrierIP =
5376 createBarrier(LocationDescription(Builder.saveIP(), DL),
5377 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5378 /* CheckCancelFlag */ false);
5379 if (!BarrierIP)
5380 return BarrierIP.takeError();
5381 }
5382
5383 CLI->invalidate();
5384 return AfterIP;
5385}
5386
5387/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5388/// after this \p OldTarget will be orphaned.
5390 BasicBlock *NewTarget, DebugLoc DL) {
5391 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5392 redirectTo(Pred, NewTarget, DL);
5393}
5394
5395/// Determine which blocks in \p BBs are reachable from outside and remove the
5396/// ones that are not reachable from the function.
5399 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5400 for (Use &U : BB->uses()) {
5401 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5402 if (!UseInst)
5403 continue;
5404 if (BBsToErase.count(UseInst->getParent()))
5405 continue;
5406 return true;
5407 }
5408 return false;
5409 };
5410
5411 while (BBsToErase.remove_if(HasRemainingUses)) {
5412 // Try again if anything was removed.
5413 }
5414
5415 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5416 DeleteDeadBlocks(BBVec);
5417}
5418
5421 InsertPointTy ComputeIP) {
5422 assert(Loops.size() >= 1 && "At least one loop required");
5423 size_t NumLoops = Loops.size();
5424
5425 // Nothing to do if there is already just one loop.
5426 if (NumLoops == 1)
5427 return Loops.front();
5428
5429 CanonicalLoopInfo *Outermost = Loops.front();
5430 CanonicalLoopInfo *Innermost = Loops.back();
5431 BasicBlock *OrigPreheader = Outermost->getPreheader();
5432 BasicBlock *OrigAfter = Outermost->getAfter();
5433 Function *F = OrigPreheader->getParent();
5434
5435 // Loop control blocks that may become orphaned later.
5436 SmallVector<BasicBlock *, 12> OldControlBBs;
5437 OldControlBBs.reserve(6 * Loops.size());
5439 Loop->collectControlBlocks(OldControlBBs);
5440
5441 // Setup the IRBuilder for inserting the trip count computation.
5443 if (ComputeIP.isSet())
5444 Builder.restoreIP(ComputeIP);
5445 else
5446 Builder.restoreIP(Outermost->getPreheaderIP());
5447
5448 // Derive the collapsed' loop trip count.
5449 // TODO: Find common/largest indvar type.
5450 Value *CollapsedTripCount = nullptr;
5451 for (CanonicalLoopInfo *L : Loops) {
5452 assert(L->isValid() &&
5453 "All loops to collapse must be valid canonical loops");
5454 Value *OrigTripCount = L->getTripCount();
5455 if (!CollapsedTripCount) {
5456 CollapsedTripCount = OrigTripCount;
5457 continue;
5458 }
5459
5460 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5461 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5462 }
5463
5464 // Create the collapsed loop control flow.
5465 CanonicalLoopInfo *Result =
5466 createLoopSkeleton(DL, CollapsedTripCount, F,
5467 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5468
5469 // Build the collapsed loop body code.
5470 // Start with deriving the input loop induction variables from the collapsed
5471 // one, using a divmod scheme. To preserve the original loops' order, the
5472 // innermost loop use the least significant bits.
5473 Builder.restoreIP(Result->getBodyIP());
5474
5475 Value *Leftover = Result->getIndVar();
5476 SmallVector<Value *> NewIndVars;
5477 NewIndVars.resize(NumLoops);
5478 for (int i = NumLoops - 1; i >= 1; --i) {
5479 Value *OrigTripCount = Loops[i]->getTripCount();
5480
5481 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5482 NewIndVars[i] = NewIndVar;
5483
5484 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5485 }
5486 // Outermost loop gets all the remaining bits.
5487 NewIndVars[0] = Leftover;
5488
5489 // Construct the loop body control flow.
5490 // We progressively construct the branch structure following in direction of
5491 // the control flow, from the leading in-between code, the loop nest body, the
5492 // trailing in-between code, and rejoining the collapsed loop's latch.
5493 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
5494 // the ContinueBlock is set, continue with that block. If ContinuePred, use
5495 // its predecessors as sources.
5496 BasicBlock *ContinueBlock = Result->getBody();
5497 BasicBlock *ContinuePred = nullptr;
5498 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5499 BasicBlock *NextSrc) {
5500 if (ContinueBlock)
5501 redirectTo(ContinueBlock, Dest, DL);
5502 else
5503 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5504
5505 ContinueBlock = nullptr;
5506 ContinuePred = NextSrc;
5507 };
5508
5509 // The code before the nested loop of each level.
5510 // Because we are sinking it into the nest, it will be executed more often
5511 // that the original loop. More sophisticated schemes could keep track of what
5512 // the in-between code is and instantiate it only once per thread.
5513 for (size_t i = 0; i < NumLoops - 1; ++i)
5514 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5515
5516 // Connect the loop nest body.
5517 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5518
5519 // The code after the nested loop at each level.
5520 for (size_t i = NumLoops - 1; i > 0; --i)
5521 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5522
5523 // Connect the finished loop to the collapsed loop latch.
5524 ContinueWith(Result->getLatch(), nullptr);
5525
5526 // Replace the input loops with the new collapsed loop.
5527 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5528 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5529
5530 // Replace the input loop indvars with the derived ones.
5531 for (size_t i = 0; i < NumLoops; ++i)
5532 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5533
5534 // Remove unused parts of the input loops.
5535 removeUnusedBlocksFromParent(OldControlBBs);
5536
5537 for (CanonicalLoopInfo *L : Loops)
5538 L->invalidate();
5539
5540#ifndef NDEBUG
5541 Result->assertOK();
5542#endif
5543 return Result;
5544}
5545
5546std::vector<CanonicalLoopInfo *>
5548 ArrayRef<Value *> TileSizes) {
5549 assert(TileSizes.size() == Loops.size() &&
5550 "Must pass as many tile sizes as there are loops");
5551 int NumLoops = Loops.size();
5552 assert(NumLoops >= 1 && "At least one loop to tile required");
5553
5554 CanonicalLoopInfo *OutermostLoop = Loops.front();
5555 CanonicalLoopInfo *InnermostLoop = Loops.back();
5556 Function *F = OutermostLoop->getBody()->getParent();
5557 BasicBlock *InnerEnter = InnermostLoop->getBody();
5558 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5559
5560 // Loop control blocks that may become orphaned later.
5561 SmallVector<BasicBlock *, 12> OldControlBBs;
5562 OldControlBBs.reserve(6 * Loops.size());
5564 Loop->collectControlBlocks(OldControlBBs);
5565
5566 // Collect original trip counts and induction variable to be accessible by
5567 // index. Also, the structure of the original loops is not preserved during
5568 // the construction of the tiled loops, so do it before we scavenge the BBs of
5569 // any original CanonicalLoopInfo.
5570 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5571 for (CanonicalLoopInfo *L : Loops) {
5572 assert(L->isValid() && "All input loops must be valid canonical loops");
5573 OrigTripCounts.push_back(L->getTripCount());
5574 OrigIndVars.push_back(L->getIndVar());
5575 }
5576
5577 // Collect the code between loop headers. These may contain SSA definitions
5578 // that are used in the loop nest body. To be usable with in the innermost
5579 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5580 // these instructions may be executed more often than before the tiling.
5581 // TODO: It would be sufficient to only sink them into body of the
5582 // corresponding tile loop.
5584 for (int i = 0; i < NumLoops - 1; ++i) {
5585 CanonicalLoopInfo *Surrounding = Loops[i];
5586 CanonicalLoopInfo *Nested = Loops[i + 1];
5587
5588 BasicBlock *EnterBB = Surrounding->getBody();
5589 BasicBlock *ExitBB = Nested->getHeader();
5590 InbetweenCode.emplace_back(EnterBB, ExitBB);
5591 }
5592
5593 // Compute the trip counts of the floor loops.
5595 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5596 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5597 for (int i = 0; i < NumLoops; ++i) {
5598 Value *TileSize = TileSizes[i];
5599 Value *OrigTripCount = OrigTripCounts[i];
5600 Type *IVType = OrigTripCount->getType();
5601
5602 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5603 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5604
5605 // 0 if tripcount divides the tilesize, 1 otherwise.
5606 // 1 means we need an additional iteration for a partial tile.
5607 //
5608 // Unfortunately we cannot just use the roundup-formula
5609 // (tripcount + tilesize - 1)/tilesize
5610 // because the summation might overflow. We do not want introduce undefined
5611 // behavior when the untiled loop nest did not.
5612 Value *FloorTripOverflow =
5613 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5614
5615 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5616 Value *FloorTripCount =
5617 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5618 "omp_floor" + Twine(i) + ".tripcount", true);
5619
5620 // Remember some values for later use.
5621 FloorCompleteCount.push_back(FloorCompleteTripCount);
5622 FloorCount.push_back(FloorTripCount);
5623 FloorRems.push_back(FloorTripRem);
5624 }
5625
5626 // Generate the new loop nest, from the outermost to the innermost.
5627 std::vector<CanonicalLoopInfo *> Result;
5628 Result.reserve(NumLoops * 2);
5629
5630 // The basic block of the surrounding loop that enters the nest generated
5631 // loop.
5632 BasicBlock *Enter = OutermostLoop->getPreheader();
5633
5634 // The basic block of the surrounding loop where the inner code should
5635 // continue.
5636 BasicBlock *Continue = OutermostLoop->getAfter();
5637
5638 // Where the next loop basic block should be inserted.
5639 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5640
5641 auto EmbeddNewLoop =
5642 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5643 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5644 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5645 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5646 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5647 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5648
5649 // Setup the position where the next embedded loop connects to this loop.
5650 Enter = EmbeddedLoop->getBody();
5651 Continue = EmbeddedLoop->getLatch();
5652 OutroInsertBefore = EmbeddedLoop->getLatch();
5653 return EmbeddedLoop;
5654 };
5655
5656 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5657 const Twine &NameBase) {
5658 for (auto P : enumerate(TripCounts)) {
5659 CanonicalLoopInfo *EmbeddedLoop =
5660 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5661 Result.push_back(EmbeddedLoop);
5662 }
5663 };
5664
5665 EmbeddNewLoops(FloorCount, "floor");
5666
5667 // Within the innermost floor loop, emit the code that computes the tile
5668 // sizes.
5670 SmallVector<Value *, 4> TileCounts;
5671 for (int i = 0; i < NumLoops; ++i) {
5672 CanonicalLoopInfo *FloorLoop = Result[i];
5673 Value *TileSize = TileSizes[i];
5674
5675 Value *FloorIsEpilogue =
5676 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5677 Value *TileTripCount =
5678 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5679
5680 TileCounts.push_back(TileTripCount);
5681 }
5682
5683 // Create the tile loops.
5684 EmbeddNewLoops(TileCounts, "tile");
5685
5686 // Insert the inbetween code into the body.
5687 BasicBlock *BodyEnter = Enter;
5688 BasicBlock *BodyEntered = nullptr;
5689 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5690 BasicBlock *EnterBB = P.first;
5691 BasicBlock *ExitBB = P.second;
5692
5693 if (BodyEnter)
5694 redirectTo(BodyEnter, EnterBB, DL);
5695 else
5696 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5697
5698 BodyEnter = nullptr;
5699 BodyEntered = ExitBB;
5700 }
5701
5702 // Append the original loop nest body into the generated loop nest body.
5703 if (BodyEnter)
5704 redirectTo(BodyEnter, InnerEnter, DL);
5705 else
5706 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5708
5709 // Replace the original induction variable with an induction variable computed
5710 // from the tile and floor induction variables.
5711 Builder.restoreIP(Result.back()->getBodyIP());
5712 for (int i = 0; i < NumLoops; ++i) {
5713 CanonicalLoopInfo *FloorLoop = Result[i];
5714 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5715 Value *OrigIndVar = OrigIndVars[i];
5716 Value *Size = TileSizes[i];
5717
5718 Value *Scale =
5719 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5720 Value *Shift =
5721 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5722 OrigIndVar->replaceAllUsesWith(Shift);
5723 }
5724
5725 // Remove unused parts of the original loops.
5726 removeUnusedBlocksFromParent(OldControlBBs);
5727
5728 for (CanonicalLoopInfo *L : Loops)
5729 L->invalidate();
5730
5731#ifndef NDEBUG
5732 for (CanonicalLoopInfo *GenL : Result)
5733 GenL->assertOK();
5734#endif
5735 return Result;
5736}
5737
5738/// Attach metadata \p Properties to the basic block described by \p BB. If the
5739/// basic block already has metadata, the basic block properties are appended.
5741 ArrayRef<Metadata *> Properties) {
5742 // Nothing to do if no property to attach.
5743 if (Properties.empty())
5744 return;
5745
5746 LLVMContext &Ctx = BB->getContext();
5747 SmallVector<Metadata *> NewProperties;
5748 NewProperties.push_back(nullptr);
5749
5750 // If the basic block already has metadata, prepend it to the new metadata.
5751 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5752 if (Existing)
5753 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5754
5755 append_range(NewProperties, Properties);
5756 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5757 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5758
5759 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5760}
5761
5762/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5763/// loop already has metadata, the loop properties are appended.
5765 ArrayRef<Metadata *> Properties) {
5766 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5767
5768 // Attach metadata to the loop's latch
5769 BasicBlock *Latch = Loop->getLatch();
5770 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5771 addBasicBlockMetadata(Latch, Properties);
5772}
5773
5774/// Attach llvm.access.group metadata to the memref instructions of \p Block
5775static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5776 LoopInfo &LI) {
5777 for (Instruction &I : *Block) {
5778 if (I.mayReadOrWriteMemory()) {
5779 // TODO: This instruction may already have access group from
5780 // other pragmas e.g. #pragma clang loop vectorize. Append
5781 // so that the existing metadata is not overwritten.
5782 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5783 }
5784 }
5785}
5786
5790 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5791 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5792}
5793
5797 Loop, {
5798 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5799 });
5800}
5801
5802void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5803 Value *IfCond, ValueToValueMapTy &VMap,
5804 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5805 const Twine &NamePrefix) {
5806 Function *F = CanonicalLoop->getFunction();
5807
5808 // We can't do
5809 // if (cond) {
5810 // simd_loop;
5811 // } else {
5812 // non_simd_loop;
5813 // }
5814 // because then the CanonicalLoopInfo would only point to one of the loops:
5815 // leading to other constructs operating on the same loop to malfunction.
5816 // Instead generate
5817 // while (...) {
5818 // if (cond) {
5819 // simd_body;
5820 // } else {
5821 // not_simd_body;
5822 // }
5823 // }
5824 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5825 // body at -O3
5826
5827 // Define where if branch should be inserted
5828 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5829
5830 // Create additional blocks for the if statement
5831 BasicBlock *Cond = SplitBeforeIt->getParent();
5832 llvm::LLVMContext &C = Cond->getContext();
5834 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5836 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5837
5838 // Create if condition branch.
5839 Builder.SetInsertPoint(SplitBeforeIt);
5840 Instruction *BrInstr =
5841 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5842 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5843 // Then block contains branch to omp loop body which needs to be vectorized
5844 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5845 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5846
5847 Builder.SetInsertPoint(ElseBlock);
5848
5849 // Clone loop for the else branch
5851
5852 SmallVector<BasicBlock *, 8> ExistingBlocks;
5853 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5854 ExistingBlocks.push_back(ThenBlock);
5855 ExistingBlocks.append(L->block_begin(), L->block_end());
5856 // Cond is the block that has the if clause condition
5857 // LoopCond is omp_loop.cond
5858 // LoopHeader is omp_loop.header
5859 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5860 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
5861 assert(LoopCond && LoopHeader && "Invalid loop structure");
5862 for (BasicBlock *Block : ExistingBlocks) {
5863 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5864 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5865 continue;
5866 }
5867 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5868
5869 // fix name not to be omp.if.then
5870 if (Block == ThenBlock)
5871 NewBB->setName(NamePrefix + ".if.else");
5872
5873 NewBB->moveBefore(CanonicalLoop->getExit());
5874 VMap[Block] = NewBB;
5875 NewBlocks.push_back(NewBB);
5876 }
5877 remapInstructionsInBlocks(NewBlocks, VMap);
5878 Builder.CreateBr(NewBlocks.front());
5879
5880 // The loop latch must have only one predecessor. Currently it is branched to
5881 // from both the 'then' and 'else' branches.
5882 L->getLoopLatch()->splitBasicBlock(
5883 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5884
5885 // Ensure that the then block is added to the loop so we add the attributes in
5886 // the next step
5887 L->addBasicBlockToLoop(ThenBlock, LI);
5888}
5889
5890unsigned
5892 const StringMap<bool> &Features) {
5893 if (TargetTriple.isX86()) {
5894 if (Features.lookup("avx512f"))
5895 return 512;
5896 else if (Features.lookup("avx"))
5897 return 256;
5898 return 128;
5899 }
5900 if (TargetTriple.isPPC())
5901 return 128;
5902 if (TargetTriple.isWasm())
5903 return 128;
5904 return 0;
5905}
5906
5908 MapVector<Value *, Value *> AlignedVars,
5909 Value *IfCond, OrderKind Order,
5910 ConstantInt *Simdlen, ConstantInt *Safelen) {
5912
5913 Function *F = CanonicalLoop->getFunction();
5914
5915 // TODO: We should not rely on pass manager. Currently we use pass manager
5916 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5917 // object. We should have a method which returns all blocks between
5918 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5920 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5921 FAM.registerPass([]() { return LoopAnalysis(); });
5922 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5923
5924 LoopAnalysis LIA;
5925 LoopInfo &&LI = LIA.run(*F, FAM);
5926
5927 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5928 if (AlignedVars.size()) {
5930 for (auto &AlignedItem : AlignedVars) {
5931 Value *AlignedPtr = AlignedItem.first;
5932 Value *Alignment = AlignedItem.second;
5933 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5934 Builder.SetInsertPoint(loadInst->getNextNode());
5935 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5936 Alignment);
5937 }
5938 Builder.restoreIP(IP);
5939 }
5940
5941 if (IfCond) {
5942 ValueToValueMapTy VMap;
5943 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5944 }
5945
5947
5948 // Get the basic blocks from the loop in which memref instructions
5949 // can be found.
5950 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5951 // preferably without running any passes.
5952 for (BasicBlock *Block : L->getBlocks()) {
5953 if (Block == CanonicalLoop->getCond() ||
5954 Block == CanonicalLoop->getHeader())
5955 continue;
5956 Reachable.insert(Block);
5957 }
5958
5959 SmallVector<Metadata *> LoopMDList;
5960
5961 // In presence of finite 'safelen', it may be unsafe to mark all
5962 // the memory instructions parallel, because loop-carried
5963 // dependences of 'safelen' iterations are possible.
5964 // If clause order(concurrent) is specified then the memory instructions
5965 // are marked parallel even if 'safelen' is finite.
5966 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5967 // Add access group metadata to memory-access instructions.
5968 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5969 for (BasicBlock *BB : Reachable)
5970 addSimdMetadata(BB, AccessGroup, LI);
5971 // TODO: If the loop has existing parallel access metadata, have
5972 // to combine two lists.
5973 LoopMDList.push_back(MDNode::get(
5974 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5975 }
5976
5977 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5978 // versions so we can't add the loop attributes in that case.
5979 if (IfCond) {
5980 // we can still add llvm.loop.parallel_access
5981 addLoopMetadata(CanonicalLoop, LoopMDList);
5982 return;
5983 }
5984
5985 // Use the above access group metadata to create loop level
5986 // metadata, which should be distinct for each loop.
5987 ConstantAsMetadata *BoolConst =
5989 LoopMDList.push_back(MDNode::get(
5990 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5991
5992 if (Simdlen || Safelen) {
5993 // If both simdlen and safelen clauses are specified, the value of the
5994 // simdlen parameter must be less than or equal to the value of the safelen
5995 // parameter. Therefore, use safelen only in the absence of simdlen.
5996 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5997 LoopMDList.push_back(
5998 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5999 ConstantAsMetadata::get(VectorizeWidth)}));
6000 }
6001
6002 addLoopMetadata(CanonicalLoop, LoopMDList);
6003}
6004
6005/// Create the TargetMachine object to query the backend for optimization
6006/// preferences.
6007///
6008/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6009/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6010/// needed for the LLVM pass pipeline. We use some default options to avoid
6011/// having to pass too many settings from the frontend that probably do not
6012/// matter.
6013///
6014/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6015/// method. If we are going to use TargetMachine for more purposes, especially
6016/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6017/// might become worth requiring front-ends to pass on their TargetMachine,
6018/// or at least cache it between methods. Note that while frontends such as Clang
6019/// have just a single main TargetMachine per translation unit, "target-cpu" and
6020/// "target-features" that determine the TargetMachine are per-function and can
6021/// be overridden using __attribute__((target("OPTIONS"))).
6022static std::unique_ptr<TargetMachine>
// NOTE(review): the parameter-list line of this definition is elided in this
// view; the body reads a Function *F and a CodeGenOptLevel-style OptLevel.
6024 Module *M = F->getParent();
6025
// CPU/feature strings are per-function attributes (see doc comment above).
6026 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6027 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6028 const llvm::Triple &Triple = M->getTargetTriple();
6029
6030 std::string Error;
// NOTE(review): the lookup producing TheTarget (presumably
// TargetRegistry::lookupTarget filling Error) is elided in this view.
6032 if (!TheTarget)
// Unknown/unregistered target: return a null TargetMachine; callers must
// handle the empty result (see the `if (TM)` guard in the unroll heuristic).
6033 return {};
6034
6036 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6037 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6038 /*CodeModel=*/std::nullopt, OptLevel));
6039}
6040
6041/// Heuristically determine the best-performant unroll factor for \p CLI. This
6042/// depends on the target processor. We are re-using the same heuristics as the
6043/// LoopUnrollPass.
6044///
6045/// Returns 1 to signal "do not unroll"; otherwise the suggested factor.
// NOTE(review): the function signature line is elided in this view.
6045 Function *F = CLI->getFunction();
6046
6047 // Assume the user requests the most aggressive unrolling, even if the rest of
6048 // the code is optimized using a lower setting.
// NOTE(review): the declaration of OptLevel (aggressive codegen opt level) is
// elided in this view.
6050 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6051
// Build a minimal analysis manager with just the analyses the unroll
// heuristics below consume.
// NOTE(review): the FunctionAnalysisManager declaration is elided here.
6053 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6054 FAM.registerPass([]() { return AssumptionAnalysis(); });
6055 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6056 FAM.registerPass([]() { return LoopAnalysis(); });
6057 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6058 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
// Use target-specific TTI when a TargetMachine could be created; otherwise
// fall back to the default-constructed (generic) TargetIRAnalysis.
6059 TargetIRAnalysis TIRA;
6060 if (TM)
6061 TIRA = TargetIRAnalysis(
6062 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6063 FAM.registerPass([&]() { return TIRA; });
6064
// Run the analyses directly instead of going through a pass pipeline.
// NOTE(review): several analysis object declarations (SEA, DTA, ACT, ORE)
// are elided between these lines in this view.
6065 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6067 ScalarEvolution &&SE = SEA.run(*F, FAM);
6069 DominatorTree &&DT = DTA.run(*F, FAM);
6070 LoopAnalysis LIA;
6071 LoopInfo &&LI = LIA.run(*F, FAM);
6073 AssumptionCache &&AC = ACT.run(*F, FAM);
6075
6076 Loop *L = LI.getLoopFor(CLI->getHeader());
6077 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6078
// Gather the LoopUnrollPass preferences; allow partial/runtime unrolling.
// NOTE(review): the UnrollingPreferences declaration line is elided.
6080 L, SE, TTI,
6081 /*BlockFrequencyInfo=*/nullptr,
6082 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6083 /*UserThreshold=*/std::nullopt,
6084 /*UserCount=*/std::nullopt,
6085 /*UserAllowPartial=*/true,
6086 /*UserAllowRuntime=*/true,
6087 /*UserUpperBound=*/std::nullopt,
6088 /*UserFullUnrollMaxCount=*/std::nullopt);
6089
6090 UP.Force = true;
6091
6092 // Account for additional optimizations taking place before the LoopUnrollPass
6093 // would unroll the loop.
// NOTE(review): threshold-scaling statements (by UnrollThresholdFactor, the
// cl::opt declared at the top of this file) are elided here.
6096
6097 // Use normal unroll factors even if the rest of the code is optimized for
6098 // size.
6101
6102 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6103 << " Threshold=" << UP.Threshold << "\n"
6104 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6105 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6106 << " PartialOptSizeThreshold="
6107 << UP.PartialOptSizeThreshold << "\n");
6108
6109 // Disable peeling.
// NOTE(review): the PeelingPreferences declaration lines are elided here.
6112 /*UserAllowPeeling=*/false,
6113 /*UserAllowProfileBasedPeeling=*/false,
6114 /*UnrollingSpecficValues=*/false);
6115
6117 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6118
6119 // Assume that reads and writes to stack variables can be eliminated by
6120 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6121 // size.
6122 for (BasicBlock *BB : L->blocks()) {
6123 for (Instruction &I : *BB) {
6124 Value *Ptr;
6125 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6126 Ptr = Load->getPointerOperand();
6127 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6128 Ptr = Store->getPointerOperand();
6129 } else
6130 continue;
6131
6132 Ptr = Ptr->stripPointerCasts();
6133
// Only allocas in the entry block are treated as promotable stack slots.
6134 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6135 if (Alloca->getParent() == &F->getEntryBlock())
6136 EphValues.insert(&I);
6137 }
6138 }
6139 }
6140
6141 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6142
6143 // Loop is not unrollable if the loop contains certain instructions.
6144 if (!UCE.canUnroll()) {
6145 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6146 return 1;
6147 }
6148
6149 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6150 << "\n");
6151
6152 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6153 // be able to use it.
6154 int TripCount = 0;
6155 int MaxTripCount = 0;
6156 bool MaxOrZero = false;
6157 unsigned TripMultiple = 0;
6158
6159 bool UseUpperBound = false;
// Reuse LoopUnrollPass's factor computation; the result lands in UP.Count.
6160 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6161 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6162 UseUpperBound);
6163 unsigned Factor = UP.Count;
6164 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6165
6166 // This function returns 1 to signal to not unroll a loop.
6167 if (Factor == 0)
6168 return 1;
6169 return Factor;
6170}
6171
// Partially unroll the canonical loop `Loop` by `Factor` (0 = heuristic,
// 1 = no-op). If UnrolledCLI is non-null the unrolled loop must stay a
// CanonicalLoopInfo (for further loop-associated directives), so the loop is
// tiled and the inner loop marked for full unrolling; otherwise plain unroll
// metadata is attached and the LoopUnrollPass does the work later.
// NOTE(review): the leading signature line is elided in this view.
6173 int32_t Factor,
6174 CanonicalLoopInfo **UnrolledCLI) {
6175 assert(Factor >= 0 && "Unroll factor must not be negative");
6176
6177 Function *F = Loop->getFunction();
6178 LLVMContext &Ctx = F->getContext();
6179
6180 // If the unrolled loop is not used for another loop-associated directive, it
6181 // is sufficient to add metadata for the LoopUnrollPass.
6182 if (!UnrolledCLI) {
6183 SmallVector<Metadata *, 2> LoopMetadata;
6184 LoopMetadata.push_back(
6185 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6186
6187 if (Factor >= 1) {
// NOTE(review): the declaration of FactorConst is elided in this view.
6189 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6190 LoopMetadata.push_back(MDNode::get(
6191 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6192 }
6193
6194 addLoopMetadata(Loop, LoopMetadata);
6195 return;
6196 }
6197
6198 // Heuristically determine the unroll factor.
6199 if (Factor == 0)
// NOTE(review): the call assigning Factor from
// computeHeuristicUnrollFactor(Loop) is elided in this view.
6201
6202 // No change required with unroll factor 1.
6203 if (Factor == 1) {
6204 *UnrolledCLI = Loop;
6205 return;
6206 }
6207
6208 assert(Factor >= 2 &&
6209 "unrolling only makes sense with a factor of 2 or larger");
6210
6211 Type *IndVarTy = Loop->getIndVarType();
6212
6213 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6214 // unroll the inner loop.
6215 Value *FactorVal =
6216 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6217 /*isSigned=*/false));
6218 std::vector<CanonicalLoopInfo *> LoopNest =
6219 tileLoops(DL, {Loop}, {FactorVal});
6220 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
// Outer (floor) loop remains the canonical loop returned to the caller.
6221 *UnrolledCLI = LoopNest[0];
6222 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6223
6224 // LoopUnrollPass can only fully unroll loops with constant trip count.
6225 // Unroll by the unroll factor with a fallback epilog for the remainder
6226 // iterations if necessary.
// NOTE(review): the FactorConst declaration and addLoopMetadata call lines
// are partially elided in this view.
6228 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6230 InnerLoop,
6231 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6233 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6234
6235#ifndef NDEBUG
6236 (*UnrolledCLI)->assertOK();
6237#endif
6238}
6239
// Emit a call to __kmpc_copyprivate(ident, tid, BufSize, CpyBuf, CpyFn,
// *DidIt) to broadcast the copyprivate variable from the thread that executed
// the single region (DidIt == 1) to all other threads.
// NOTE(review): the leading signature lines are elided in this view.
6242 llvm::Value *BufSize, llvm::Value *CpyBuf,
6243 llvm::Value *CpyFn, llvm::Value *DidIt) {
6244 if (!updateToLocation(Loc))
6245 return Loc.IP;
6246
6247 uint32_t SrcLocStrSize;
6248 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6249 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6250 Value *ThreadId = getOrCreateThreadID(Ident);
6251
// The runtime expects the i32 flag value, not its address.
6252 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6253
6254 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6255
6256 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6257 Builder.CreateCall(Fn, Args);
6258
6259 return Builder.saveIP();
6260}
6261
// Emit an OpenMP 'single' region guarded by __kmpc_single/__kmpc_end_single.
// When copyprivate variables (CPVars) are present, a DidIt flag records which
// thread ran the region and one __kmpc_copyprivate call is emitted per
// variable; otherwise a closing barrier is emitted unless IsNowait.
// NOTE(review): the leading signature line and the CPFuncs parameter line are
// elided in this view.
6263 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6264 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6266
6267 if (!updateToLocation(Loc))
6268 return Loc.IP;
6269
6270 // If needed allocate and initialize `DidIt` with 0.
6271 // DidIt: flag variable: 1=single thread; 0=not single thread.
6272 llvm::Value *DidIt = nullptr;
6273 if (!CPVars.empty()) {
// NOTE(review): the alloca/zero-store of DidIt is elided in this view.
6276 }
6277
6278 Directive OMPD = Directive::OMPD_single;
6279 uint32_t SrcLocStrSize;
6280 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6281 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6282 Value *ThreadId = getOrCreateThreadID(Ident);
6283 Value *Args[] = {Ident, ThreadId};
6284
6285 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6286 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6287
6288 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6289 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6290
// Wrap the user finalization so the single thread can mark DidIt before
// leaving the region.
6291 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6292 if (Error Err = FiniCB(IP))
6293 return Err;
6294
6295 // The thread that executes the single region must set `DidIt` to 1.
6296 // This is used by __kmpc_copyprivate, to know if the caller is the
6297 // single thread or not.
6298 if (DidIt)
// NOTE(review): the store of 1 into DidIt is elided in this view.
6300
6301 return Error::success();
6302 };
6303
6304 // generates the following:
6305 // if (__kmpc_single()) {
6306 // .... single region ...
6307 // __kmpc_end_single
6308 // }
6309 // __kmpc_copyprivate
6310 // __kmpc_barrier
6311
6312 InsertPointOrErrorTy AfterIP =
6313 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6314 /*Conditional*/ true,
6315 /*hasFinalize*/ true);
6316 if (!AfterIP)
6317 return AfterIP.takeError();
6318
6319 if (DidIt) {
6320 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6321 // NOTE BufSize is currently unused, so just pass 0.
// NOTE(review): the createCopyPrivate(...) call line is elided in this view.
6323 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6324 CPFuncs[I], DidIt);
6325 // NOTE __kmpc_copyprivate already inserts a barrier
6326 } else if (!IsNowait) {
6327 InsertPointOrErrorTy AfterIP =
// NOTE(review): the createBarrier(...) call line is elided in this view.
6329 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6330 /* CheckCancelFlag */ false);
6331 if (!AfterIP)
6332 return AfterIP.takeError();
6333 }
6334 return Builder.saveIP();
6335}
6336
// Emit an OpenMP 'critical' region. The region is bracketed by
// __kmpc_critical (or __kmpc_critical_with_hint when a hint value is given)
// and __kmpc_end_critical, keyed on a named per-region lock variable.
// NOTE(review): the leading signature line is elided in this view.
6338 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6339 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6340
6341 if (!updateToLocation(Loc))
6342 return Loc.IP;
6343
6344 Directive OMPD = Directive::OMPD_critical;
6345 uint32_t SrcLocStrSize;
6346 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6347 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6348 Value *ThreadId = getOrCreateThreadID(Ident);
// Named critical sections map to distinct lock globals.
6349 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6350 Value *Args[] = {Ident, ThreadId, LockVar};
6351
6352 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6353 Function *RTFn = nullptr;
6354 if (HintInst) {
6355 // Add Hint to entry Args and create call
6356 EnterArgs.push_back(HintInst);
6357 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6358 } else {
6359 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6360 }
6361 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6362
6363 Function *ExitRTLFn =
6364 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6365 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6366
6367 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6368 /*Conditional*/ false, /*hasFinalize*/ true);
6369}
6370
// Emit an 'ordered depend' construct for doacross loops: materialize the
// iteration vector (NumLoops i64 values) into a stack array and pass its base
// address to __kmpc_doacross_post (depend(source)) or __kmpc_doacross_wait
// (depend(sink)).
// NOTE(review): the leading signature lines are elided in this view.
6373 InsertPointTy AllocaIP, unsigned NumLoops,
6374 ArrayRef<llvm::Value *> StoreValues,
6375 const Twine &Name, bool IsDependSource) {
6376 assert(
6377 llvm::all_of(StoreValues,
6378 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6379 "OpenMP runtime requires depend vec with i64 type");
6380
6381 if (!updateToLocation(Loc))
6382 return Loc.IP;
6383
6384 // Allocate space for vector and generate alloc instruction.
6385 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6386 Builder.restoreIP(AllocaIP);
6387 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6388 ArgsBase->setAlignment(Align(8));
// Switch back from the alloca block to the directive's code position.
6389 updateToLocation(Loc);
6390
6391 // Store the index value with offset in depend vector.
6392 for (unsigned I = 0; I < NumLoops; ++I) {
6393 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6394 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6395 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6396 STInst->setAlignment(Align(8));
6397 }
6398
6399 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6400 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6401
6402 uint32_t SrcLocStrSize;
6403 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6404 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6405 Value *ThreadId = getOrCreateThreadID(Ident);
6406 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6407
6408 Function *RTLFn = nullptr;
6409 if (IsDependSource)
6410 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6411 else
6412 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6413 Builder.CreateCall(RTLFn, Args);
6414
6415 return Builder.saveIP();
6416}
6417
// Emit an 'ordered' region. With IsThreads, the body is bracketed by
// __kmpc_ordered/__kmpc_end_ordered; for the simd variant no runtime calls
// are needed (Entry/ExitCall stay null and EmitOMPInlinedRegion skips them).
// NOTE(review): the leading signature line is elided in this view.
6419 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6420 FinalizeCallbackTy FiniCB, bool IsThreads) {
6421 if (!updateToLocation(Loc))
6422 return Loc.IP;
6423
6424 Directive OMPD = Directive::OMPD_ordered;
6425 Instruction *EntryCall = nullptr;
6426 Instruction *ExitCall = nullptr;
6427
6428 if (IsThreads) {
6429 uint32_t SrcLocStrSize;
6430 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6431 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6432 Value *ThreadId = getOrCreateThreadID(Ident);
6433 Value *Args[] = {Ident, ThreadId};
6434
6435 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6436 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6437
6438 Function *ExitRTLFn =
6439 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6440 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6441 }
6442
6443 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6444 /*Conditional*/ false, /*hasFinalize*/ true);
6445}
6446
// Shared scaffolding for inlined OpenMP regions (single/critical/ordered...):
// splits the current block into entry / finalize / exit blocks, emits the
// (optionally conditional) entry call, runs the body callback, then the exit
// call plus any registered finalization, and re-merges blocks where possible.
6447OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6448 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6449 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6450 bool HasFinalize, bool IsCancellable) {
6451
// Push the finalization record; emitCommonDirectiveExit pops and runs it.
6452 if (HasFinalize)
6453 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6454
6455 // Create inlined region's entry and body blocks, in preparation
6456 // for conditional creation
6457 BasicBlock *EntryBB = Builder.GetInsertBlock();
6458 Instruction *SplitPos = EntryBB->getTerminator();
// Insert a temporary terminator so the block can be split when it has none
// (or a non-branch terminator); it is erased again at the end.
6459 if (!isa_and_nonnull<BranchInst>(SplitPos))
6460 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6461 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6462 BasicBlock *FiniBB =
6463 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6464
// NOTE(review): the line capturing the returned insert point of this call is
// elided in this view.
6466 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6467
6468 // generate body
6469 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6470 /* CodeGenIP */ Builder.saveIP()))
6471 return Err;
6472
6473 // emit exit call and do any needed finalization.
6474 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6475 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6476 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6477 "Unexpected control flow graph state!!");
6478 InsertPointOrErrorTy AfterIP =
6479 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6480 if (!AfterIP)
6481 return AfterIP.takeError();
6482 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6483 "Unexpected Control Flow State!");
// NOTE(review): a MergeBlockIntoPredecessor(FiniBB) call appears to be elided
// here — TODO confirm against the full source.
6485
6486 // If we are skipping the region of a non conditional, remove the exit
6487 // block, and clear the builder's insertion point.
6488 assert(SplitPos->getParent() == ExitBB &&
6489 "Unexpected Insertion point location!");
6490 auto merged = MergeBlockIntoPredecessor(ExitBB);
6491 BasicBlock *ExitPredBB = SplitPos->getParent();
6492 auto InsertBB = merged ? ExitPredBB : ExitBB;
// Drop the temporary terminator created above, if we made one.
6493 if (!isa_and_nonnull<BranchInst>(SplitPos))
6494 SplitPos->eraseFromParent();
6495 Builder.SetInsertPoint(InsertBB);
6496
6497 return Builder.saveIP();
6498}
6499
// Emit the conditional entry of an inlined region: when Conditional is set,
// turn `EntryCall`'s result into an i1 and branch to a fresh "omp_region.body"
// block (taken) or ExitBB (not taken). Returns an insertion point in ExitBB,
// leaving the builder positioned for body generation.
6500OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6501 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6502 // if nothing to do, Return current insertion point.
6503 if (!Conditional || !EntryCall)
6504 return Builder.saveIP();
6505
6506 BasicBlock *EntryBB = Builder.GetInsertBlock();
6507 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6508 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
// Temporary terminator keeping ThenBB well-formed; erased below.
6509 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6510
6511 // Emit thenBB and set the Builder's insertion point there for
6512 // body generation next. Place the block after the current block.
6513 Function *CurFn = EntryBB->getParent();
6514 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6515
6516 // Move Entry branch to end of ThenBB, and replace with conditional
6517 // branch (If-stmt)
6518 Instruction *EntryBBTI = EntryBB->getTerminator();
6519 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6520 EntryBBTI->removeFromParent();
// NOTE(review): Builder.SetInsertPoint calls before/after these two lines
// appear to be elided in this view — TODO confirm against the full source.
6522 Builder.Insert(EntryBBTI);
6523 UI->eraseFromParent();
6525
6526 // return an insertion point to ExitBB.
6527 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6528}
6529
6530OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6531 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6532 bool HasFinalize) {
6533
6534 Builder.restoreIP(FinIP);
6535
6536 // If there is finalization to do, emit it before the exit call
6537 if (HasFinalize) {
6538 assert(!FinalizationStack.empty() &&
6539 "Unexpected finalization stack state!");
6540
6541 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6542 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6543
6544 if (Error Err = Fi.FiniCB(FinIP))
6545 return Err;
6546
6547 BasicBlock *FiniBB = FinIP.getBlock();
6548 Instruction *FiniBBTI = FiniBB->getTerminator();
6549
6550 // set Builder IP for call creation
6551 Builder.SetInsertPoint(FiniBBTI);
6552 }
6553
6554 if (!ExitCall)
6555 return Builder.saveIP();
6556
6557 // place the Exitcall as last instruction before Finalization block terminator
6558 ExitCall->removeFromParent();
6559 Builder.Insert(ExitCall);
6560
6561 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6562 ExitCall->getIterator());
6563}
6564
// Build the control flow for a 'copyin' clause: compare the master and
// private addresses as integers and, when they differ, branch into a
// "copyin.not.master" block where the caller emits the copy code; both paths
// rejoin at "copyin.not.master.end".
// NOTE(review): the leading signature line is elided in this view.
6566 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6567 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6568 if (!IP.isSet())
6569 return IP;
6570
// NOTE(review): an insert-point-guard declaration appears to be elided here.
6572
6573 // creates the following CFG structure
6574 // OMP_Entry : (MasterAddr != PrivateAddr)?
6575 // F T
6576 // | \
6577 // | copin.not.master
6578 // | /
6579 // v /
6580 // copyin.not.master.end
6581 // |
6582 // v
6583 // OMP.Entry.Next
6584
6585 BasicBlock *OMP_Entry = IP.getBlock();
6586 Function *CurFn = OMP_Entry->getParent();
6587 BasicBlock *CopyBegin =
6588 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6589 BasicBlock *CopyEnd = nullptr;
6590
6591 // If entry block is terminated, split to preserve the branch to following
6592 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6593 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6594 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6595 "copyin.not.master.end");
// Drop the unconditional branch created by the split; it is replaced by the
// conditional branch emitted below.
6596 OMP_Entry->getTerminator()->eraseFromParent();
6597 } else {
6598 CopyEnd =
6599 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6600 }
6601
6602 Builder.SetInsertPoint(OMP_Entry);
// Compare addresses as integers of pointer width.
6603 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6604 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6605 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6606 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6607
6608 Builder.SetInsertPoint(CopyBegin);
6609 if (BranchtoEnd)
// NOTE(review): the branch to CopyEnd (with insert point set before it) is
// elided in this view.
6611
6612 return Builder.saveIP();
6613}
6614
// Emit a call to __kmpc_alloc(tid, Size, Allocator), returning the call
// instruction that yields the allocated pointer.
// NOTE(review): the leading signature line and an insert-point-guard line are
// elided in this view.
6616 Value *Size, Value *Allocator,
6617 std::string Name) {
6619 updateToLocation(Loc);
6620
// Ident is only needed to derive the thread id; it is not passed to the
// runtime call below.
6621 uint32_t SrcLocStrSize;
6622 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6623 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6624 Value *ThreadId = getOrCreateThreadID(Ident);
6625 Value *Args[] = {ThreadId, Size, Allocator};
6626
6627 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6628
6629 return Builder.CreateCall(Fn, Args, Name);
6630}
6631
// Emit a call to __kmpc_free(tid, Addr, Allocator), the counterpart of
// createOMPAlloc above.
// NOTE(review): the leading signature line and an insert-point-guard line are
// elided in this view.
6633 Value *Addr, Value *Allocator,
6634 std::string Name) {
6636 updateToLocation(Loc);
6637
6638 uint32_t SrcLocStrSize;
6639 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6640 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6641 Value *ThreadId = getOrCreateThreadID(Ident);
6642 Value *Args[] = {ThreadId, Addr, Allocator};
6643 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6644 return Builder.CreateCall(Fn, Args, Name);
6645}
6646
// Emit a call to __tgt_interop_init for the 'interop init' clause. A null
// Device defaults (line elided) and null NumDependences collapses the
// dependence arguments to (0, nullptr).
// NOTE(review): the leading signature line and an insert-point-guard line are
// elided in this view.
6648 const LocationDescription &Loc, Value *InteropVar,
6649 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6650 Value *DependenceAddress, bool HaveNowaitClause) {
6652 updateToLocation(Loc);
6653
6654 uint32_t SrcLocStrSize;
6655 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6656 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6657 Value *ThreadId = getOrCreateThreadID(Ident);
6658 if (Device == nullptr)
// NOTE(review): the default-device assignment line is elided in this view.
6660 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6661 if (NumDependences == nullptr) {
6662 NumDependences = ConstantInt::get(Int32, 0);
6663 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6664 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6665 }
6666 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6667 Value *Args[] = {
6668 Ident, ThreadId, InteropVar, InteropTypeVal,
6669 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6670
6671 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6672
6673 return Builder.CreateCall(Fn, Args);
6674}
6675
// Emit a call to __tgt_interop_destroy for the 'interop destroy' clause.
// Mirrors createOMPInteropInit above minus the interop-type argument.
// NOTE(review): the leading signature line and an insert-point-guard line are
// elided in this view.
6677 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6678 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6680 updateToLocation(Loc);
6681
6682 uint32_t SrcLocStrSize;
6683 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6684 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6685 Value *ThreadId = getOrCreateThreadID(Ident);
6686 if (Device == nullptr)
// NOTE(review): the default-device assignment line is elided in this view.
6688 if (NumDependences == nullptr) {
6689 NumDependences = ConstantInt::get(Int32, 0);
6690 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6691 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6692 }
6693 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6694 Value *Args[] = {
6695 Ident, ThreadId, InteropVar, Device,
6696 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6697
6698 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6699
6700 return Builder.CreateCall(Fn, Args);
6701}
6702
// Emit a call to __tgt_interop_use for the 'interop use' clause. Argument
// defaulting matches createOMPInteropDestroy above.
// NOTE(review): the leading signature line and an insert-point-guard line are
// elided in this view.
6704 Value *InteropVar, Value *Device,
6705 Value *NumDependences,
6706 Value *DependenceAddress,
6707 bool HaveNowaitClause) {
6709 updateToLocation(Loc);
6710 uint32_t SrcLocStrSize;
6711 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6712 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6713 Value *ThreadId = getOrCreateThreadID(Ident);
6714 if (Device == nullptr)
// NOTE(review): the default-device assignment line is elided in this view.
6716 if (NumDependences == nullptr) {
6717 NumDependences = ConstantInt::get(Int32, 0);
6718 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6719 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6720 }
6721 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6722 Value *Args[] = {
6723 Ident, ThreadId, InteropVar, Device,
6724 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6725
6726 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6727
6728 return Builder.CreateCall(Fn, Args);
6729}
6730
// Emit a call to __kmpc_threadprivate_cached(ident, tid, Pointer, Size,
// &cache), using an internal global (named via `Name`) as the per-variable
// threadprivate cache slot.
// NOTE(review): the leading signature line (and the Size/Name parameter
// lines) are elided in this view.
6732 const LocationDescription &Loc, llvm::Value *Pointer,
6735 updateToLocation(Loc);
6736
6737 uint32_t SrcLocStrSize;
6738 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6739 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6740 Value *ThreadId = getOrCreateThreadID(Ident);
// The cache is a module-level internal variable of pointer-to-pointer type,
// shared across calls with the same name.
6741 Constant *ThreadPrivateCache =
6742 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6743 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6744
6745 Function *Fn =
6746 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6747
6748 return Builder.CreateCall(Fn, Args);
6749}
6750
// Emit the device-side kernel prologue: materialize the per-kernel dynamic,
// configuration and kernel environment globals, call __kmpc_target_init with
// them, and branch to either the user code (init returned -1) or a
// "worker.exit" return block. Returns an insertion point at the start of the
// user_code.entry block.
// NOTE(review): the leading signature lines are elided in this view.
6752 const LocationDescription &Loc,
6754 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6755 "expected num_threads and num_teams to be specified");
6756
6757 if (!updateToLocation(Loc))
6758 return Loc.IP;
6759
6760 uint32_t SrcLocStrSize;
6761 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6762 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6763 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
// Generic (non-SPMD) kernels may use the runtime's state machine.
6764 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6765 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6766 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6767 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6768
6769 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6770 Function *Kernel = DebugKernelWrapper;
6771
6772 // We need to strip the debug prefix to get the correct kernel name.
6773 StringRef KernelName = Kernel->getName();
6774 const std::string DebugPrefix = "_debug__";
6775 if (KernelName.ends_with(DebugPrefix)) {
6776 KernelName = KernelName.drop_back(DebugPrefix.length());
6777 Kernel = M.getFunction(KernelName);
6778 assert(Kernel && "Expected the real kernel to exist");
6779 }
6780
6781 // Manifest the launch configuration in the metadata matching the kernel
6782 // environment.
6783 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6784 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6785
6786 // If MaxThreads not set, select the maximum between the default workgroup
6787 // size and the MinThreads value.
6788 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6789 if (MaxThreadsVal < 0)
6790 MaxThreadsVal = std::max(
6791 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6792
6793 if (MaxThreadsVal > 0)
6794 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6795
6796 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
// NOTE(review): the MaxThreads constant declaration (from MaxThreadsVal) is
// elided in this view; it is used in the struct initializer below.
6798 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6799 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6800 Constant *ReductionDataSize =
6801 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6802 Constant *ReductionBufferLength =
6803 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6804
// NOTE(review): the declaration of Fn (runtime function pointer) is partially
// elided in this view.
6806 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6807 const DataLayout &DL = Fn->getDataLayout();
6808
// Per-kernel mutable dynamic environment (currently the debug indentation
// level only), emitted as a weak_odr protected global.
6809 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6810 Constant *DynamicEnvironmentInitializer =
6811 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6812 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6813 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6814 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6815 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6816 DL.getDefaultGlobalsAddressSpace());
6817 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6818
// Cast only if the global's address space differs from the expected pointer
// type.
6819 Constant *DynamicEnvironment =
6820 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6821 ? DynamicEnvironmentGV
6822 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6823 DynamicEnvironmentPtr)6823;
6824
6825 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6826 ConfigurationEnvironment, {
6827 UseGenericStateMachineVal,
6828 MayUseNestedParallelismVal,
6829 IsSPMDVal,
6830 MinThreads,
6831 MaxThreads,
6832 MinTeams,
6833 MaxTeams,
6834 ReductionDataSize,
6835 ReductionBufferLength,
6836 });
6837 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6838 KernelEnvironment, {
6839 ConfigurationEnvironmentInitializer,
6840 Ident,
6841 DynamicEnvironment,
6842 });
6843 std::string KernelEnvironmentName =
6844 (KernelName + "_kernel_environment").str();
// Constant kernel environment; createTargetDeinit patches its reduction
// fields (indices {0,7} and {0,8}) after codegen.
6845 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6846 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6847 KernelEnvironmentInitializer, KernelEnvironmentName,
6848 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6849 DL.getDefaultGlobalsAddressSpace());
6850 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6851
6852 Constant *KernelEnvironment =
6853 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6854 ? KernelEnvironmentGV
6855 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6856 KernelEnvironmentPtr);
// The launch environment arrives as the wrapper's first argument; adjust its
// address space to match __kmpc_target_init's second parameter if needed.
6857 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6858 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6859 KernelLaunchEnvironment =
6860 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6861 ? KernelLaunchEnvironment
6862 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6863 KernelLaunchEnvParamTy);
6864 CallInst *ThreadKind =
6865 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6866
6867 Value *ExecUserCode = Builder.CreateICmpEQ(
6868 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6869 "exec_user_code");
6870
6871 // ThreadKind = __kmpc_target_init(...)
6872 // if (ThreadKind == -1)
6873 // user_code
6874 // else
6875 // return;
6876
// Temporary unreachable used to split off the user-code entry block.
6877 auto *UI = Builder.CreateUnreachable();
6878 BasicBlock *CheckBB = UI->getParent();
6879 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6880
6881 BasicBlock *WorkerExitBB = BasicBlock::Create(
6882 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6883 Builder.SetInsertPoint(WorkerExitBB);
// NOTE(review): the ret-void emission in WorkerExitBB is elided in this view.
6885
6886 auto *CheckBBTI = CheckBB->getTerminator();
6887 Builder.SetInsertPoint(CheckBBTI);
6888 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6889
6890 CheckBBTI->eraseFromParent();
6891 UI->eraseFromParent();
6892
6893 // Continue in the "user_code" block, see diagram above and in
6894 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6895 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6896}
6897
6899 int32_t TeamsReductionDataSize,
6900 int32_t TeamsReductionBufferLength) {
6901 if (!updateToLocation(Loc))
6902 return;
6903
6905 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6906
6907 Builder.CreateCall(Fn, {});
6908
6909 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6910 return;
6911
6913 // We need to strip the debug prefix to get the correct kernel name.
6914 StringRef KernelName = Kernel->getName();
6915 const std::string DebugPrefix = "_debug__";
6916 if (KernelName.ends_with(DebugPrefix))
6917 KernelName = KernelName.drop_back(DebugPrefix.length());
6918 auto *KernelEnvironmentGV =
6919 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6920 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6921 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6922 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6923 KernelEnvironmentInitializer,
6924 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6925 NewInitializer = ConstantFoldInsertValueInstruction(
6926 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6927 {0, 8});
6928 KernelEnvironmentGV->setInitializer(NewInitializer);
6929}
6930
6932 bool Min) {
6933 if (Kernel.hasFnAttribute(Name)) {
6934 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6935 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6936 }
6937 Kernel.addFnAttr(Name, llvm::utostr(Value));
6938}
6939
6940std::pair<int32_t, int32_t>
6942 int32_t ThreadLimit =
6943 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6944
6945 if (T.isAMDGPU()) {
6946 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6947 if (!Attr.isValid() || !Attr.isStringAttribute())
6948 return {0, ThreadLimit};
6949 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6950 int32_t LB, UB;
6951 if (!llvm::to_integer(UBStr, UB, 10))
6952 return {0, ThreadLimit};
6953 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6954 if (!llvm::to_integer(LBStr, LB, 10))
6955 return {0, UB};
6956 return {LB, UB};
6957 }
6958
6959 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6960 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6961 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6962 }
6963 return {0, ThreadLimit};
6964}
6965
6967 Function &Kernel, int32_t LB,
6968 int32_t UB) {
6969 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6970
6971 if (T.isAMDGPU()) {
6972 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6973 llvm::utostr(LB) + "," + llvm::utostr(UB));
6974 return;
6975 }
6976
6977 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6978}
6979
6980std::pair<int32_t, int32_t>
6982 // TODO: Read from backend annotations if available.
6983 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6984}
6985
6987 int32_t LB, int32_t UB) {
6988 if (T.isNVPTX())
6989 if (UB > 0)
6990 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
6991 if (T.isAMDGPU())
6992 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6993
6994 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6995}
6996
6997void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6998 Function *OutlinedFn) {
6999 if (Config.isTargetDevice()) {
7001 // TODO: Determine if DSO local can be set to true.
7002 OutlinedFn->setDSOLocal(false);
7004 if (T.isAMDGCN())
7006 else if (T.isNVPTX())
7008 else if (T.isSPIRV())
7010 }
7011}
7012
7013Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7014 StringRef EntryFnIDName) {
7015 if (Config.isTargetDevice()) {
7016 assert(OutlinedFn && "The outlined function must exist if embedded");
7017 return OutlinedFn;
7018 }
7019
7020 return new GlobalVariable(
7021 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7022 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7023}
7024
7025Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7026 StringRef EntryFnName) {
7027 if (OutlinedFn)
7028 return OutlinedFn;
7029
7030 assert(!M.getGlobalVariable(EntryFnName, true) &&
7031 "Named kernel already exists?");
7032 return new GlobalVariable(
7033 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7034 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7035}
7036
7038 TargetRegionEntryInfo &EntryInfo,
7039 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7040 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7041
7042 SmallString<64> EntryFnName;
7043 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7044
7046 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7047 if (!CBResult)
7048 return CBResult.takeError();
7049 OutlinedFn = *CBResult;
7050 } else {
7051 OutlinedFn = nullptr;
7052 }
7053
7054 // If this target outline function is not an offload entry, we don't need to
7055 // register it. This may be in the case of a false if clause, or if there are
7056 // no OpenMP targets.
7057 if (!IsOffloadEntry)
7058 return Error::success();
7059
7060 std::string EntryFnIDName =
7062 ? std::string(EntryFnName)
7063 : createPlatformSpecificName({EntryFnName, "region_id"});
7064
7065 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7066 EntryFnName, EntryFnIDName);
7067 return Error::success();
7068}
7069
7071 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7072 StringRef EntryFnName, StringRef EntryFnIDName) {
7073 if (OutlinedFn)
7074 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7075 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7076 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7078 EntryInfo, EntryAddr, OutlinedFnID,
7080 return OutlinedFnID;
7081}
7082
7084 const LocationDescription &Loc, InsertPointTy AllocaIP,
7085 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7086 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7087 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7089 BodyGenTy BodyGenType)>
7090 BodyGenCB,
7091 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7092 if (!updateToLocation(Loc))
7093 return InsertPointTy();
7094
7095 Builder.restoreIP(CodeGenIP);
7096 // Disable TargetData CodeGen on Device pass.
7097 if (Config.IsTargetDevice.value_or(false)) {
7098 if (BodyGenCB) {
7099 InsertPointOrErrorTy AfterIP =
7100 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7101 if (!AfterIP)
7102 return AfterIP.takeError();
7103 Builder.restoreIP(*AfterIP);
7104 }
7105 return Builder.saveIP();
7106 }
7107
7108 bool IsStandAlone = !BodyGenCB;
7109 MapInfosTy *MapInfo;
7110 // Generate the code for the opening of the data environment. Capture all the
7111 // arguments of the runtime call by reference because they are used in the
7112 // closing of the region.
7113 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7114 InsertPointTy CodeGenIP) -> Error {
7115 MapInfo = &GenMapInfoCB(Builder.saveIP());
7116 if (Error Err = emitOffloadingArrays(
7117 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7118 /*IsNonContiguous=*/true, DeviceAddrCB))
7119 return Err;
7120
7121 TargetDataRTArgs RTArgs;
7123
7124 // Emit the number of elements in the offloading arrays.
7125 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7126
7127 // Source location for the ident struct
7128 if (!SrcLocInfo) {
7129 uint32_t SrcLocStrSize;
7130 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7131 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7132 }
7133
7134 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7135 SrcLocInfo, DeviceID,
7136 PointerNum, RTArgs.BasePointersArray,
7137 RTArgs.PointersArray, RTArgs.SizesArray,
7138 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7139 RTArgs.MappersArray};
7140
7141 if (IsStandAlone) {
7142 assert(MapperFunc && "MapperFunc missing for standalone target data");
7143
7144 auto TaskBodyCB = [&](Value *, Value *,
7146 if (Info.HasNoWait) {
7147 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7151 }
7152
7154 OffloadingArgs);
7155
7156 if (Info.HasNoWait) {
7157 BasicBlock *OffloadContBlock =
7158 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7160 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7162 }
7163 return Error::success();
7164 };
7165
7166 bool RequiresOuterTargetTask = Info.HasNoWait;
7167 if (!RequiresOuterTargetTask)
7168 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7169 /*TargetTaskAllocaIP=*/{}));
7170 else
7171 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7172 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7173 } else {
7174 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7175 omp::OMPRTL___tgt_target_data_begin_mapper);
7176
7177 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7178
7179 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7180 if (isa<AllocaInst>(DeviceMap.second.second)) {
7181 auto *LI =
7182 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7183 Builder.CreateStore(LI, DeviceMap.second.second);
7184 }
7185 }
7186
7187 // If device pointer privatization is required, emit the body of the
7188 // region here. It will have to be duplicated: with and without
7189 // privatization.
7190 InsertPointOrErrorTy AfterIP =
7191 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7192 if (!AfterIP)
7193 return AfterIP.takeError();
7194 Builder.restoreIP(*AfterIP);
7195 }
7196 return Error::success();
7197 };
7198
7199 // If we need device pointer privatization, we need to emit the body of the
7200 // region with no privatization in the 'else' branch of the conditional.
7201 // Otherwise, we don't have to do anything.
7202 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7203 InsertPointTy CodeGenIP) -> Error {
7204 InsertPointOrErrorTy AfterIP =
7205 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7206 if (!AfterIP)
7207 return AfterIP.takeError();
7208 Builder.restoreIP(*AfterIP);
7209 return Error::success();
7210 };
7211
7212 // Generate code for the closing of the data region.
7213 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7214 TargetDataRTArgs RTArgs;
7215 Info.EmitDebug = !MapInfo->Names.empty();
7216 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7217
7218 // Emit the number of elements in the offloading arrays.
7219 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7220
7221 // Source location for the ident struct
7222 if (!SrcLocInfo) {
7223 uint32_t SrcLocStrSize;
7224 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7225 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7226 }
7227
7228 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7229 PointerNum, RTArgs.BasePointersArray,
7230 RTArgs.PointersArray, RTArgs.SizesArray,
7231 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7232 RTArgs.MappersArray};
7233 Function *EndMapperFunc =
7234 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7235
7236 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7237 return Error::success();
7238 };
7239
7240 // We don't have to do anything to close the region if the if clause evaluates
7241 // to false.
7242 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7243 return Error::success();
7244 };
7245
7246 Error Err = [&]() -> Error {
7247 if (BodyGenCB) {
7248 Error Err = [&]() {
7249 if (IfCond)
7250 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7251 return BeginThenGen(AllocaIP, Builder.saveIP());
7252 }();
7253
7254 if (Err)
7255 return Err;
7256
7257 // If we don't require privatization of device pointers, we emit the body
7258 // in between the runtime calls. This avoids duplicating the body code.
7259 InsertPointOrErrorTy AfterIP =
7260 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7261 if (!AfterIP)
7262 return AfterIP.takeError();
7263 restoreIPandDebugLoc(Builder, *AfterIP);
7264
7265 if (IfCond)
7266 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7267 return EndThenGen(AllocaIP, Builder.saveIP());
7268 }
7269 if (IfCond)
7270 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7271 return BeginThenGen(AllocaIP, Builder.saveIP());
7272 }();
7273
7274 if (Err)
7275 return Err;
7276
7277 return Builder.saveIP();
7278}
7279
7282 bool IsGPUDistribute) {
7283 assert((IVSize == 32 || IVSize == 64) &&
7284 "IV size is not compatible with the omp runtime");
7286 if (IsGPUDistribute)
7287 Name = IVSize == 32
7288 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7289 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7290 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7291 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7292 else
7293 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7294 : omp::OMPRTL___kmpc_for_static_init_4u)
7295 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7296 : omp::OMPRTL___kmpc_for_static_init_8u);
7297
7299}
7300
7302 bool IVSigned) {
7303 assert((IVSize == 32 || IVSize == 64) &&
7304 "IV size is not compatible with the omp runtime");
7305 RuntimeFunction Name = IVSize == 32
7306 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7307 : omp::OMPRTL___kmpc_dispatch_init_4u)
7308 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7309 : omp::OMPRTL___kmpc_dispatch_init_8u);
7310
7312}
7313
7315 bool IVSigned) {
7316 assert((IVSize == 32 || IVSize == 64) &&
7317 "IV size is not compatible with the omp runtime");
7318 RuntimeFunction Name = IVSize == 32
7319 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7320 : omp::OMPRTL___kmpc_dispatch_next_4u)
7321 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7322 : omp::OMPRTL___kmpc_dispatch_next_8u);
7323
7325}
7326
7328 bool IVSigned) {
7329 assert((IVSize == 32 || IVSize == 64) &&
7330 "IV size is not compatible with the omp runtime");
7331 RuntimeFunction Name = IVSize == 32
7332 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7333 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7334 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7335 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7336
7338}
7339
7341 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7342}
7343
7345 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7346 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7347
7348 DISubprogram *NewSP = Func->getSubprogram();
7349 if (!NewSP)
7350 return;
7351
7353
7354 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7355 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7356 // Only use cached variable if the arg number matches. This is important
7357 // so that DIVariable created for privatized variables are not discarded.
7358 if (NewVar && (arg == NewVar->getArg()))
7359 return NewVar;
7360
7362 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7363 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7364 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7365 return NewVar;
7366 };
7367
7368 auto UpdateDebugRecord = [&](auto *DR) {
7369 DILocalVariable *OldVar = DR->getVariable();
7370 unsigned ArgNo = 0;
7371 for (auto Loc : DR->location_ops()) {
7372 auto Iter = ValueReplacementMap.find(Loc);
7373 if (Iter != ValueReplacementMap.end()) {
7374 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7375 ArgNo = std::get<1>(Iter->second) + 1;
7376 }
7377 }
7378 if (ArgNo != 0)
7379 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7380 };
7381
7382 // The location and scope of variable intrinsics and records still point to
7383 // the parent function of the target region. Update them.
7384 for (Instruction &I : instructions(Func)) {
7385 if (auto *DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&I))
7386 UpdateDebugRecord(DDI);
7387
7388 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7389 UpdateDebugRecord(&DVR);
7390 }
7391 // An extra argument is passed to the device. Create the debug data for it.
7392 if (OMPBuilder.Config.isTargetDevice()) {
7393 DICompileUnit *CU = NewSP->getUnit();
7394 Module *M = Func->getParent();
7395 DIBuilder DB(*M, true, CU);
7396 DIType *VoidPtrTy =
7397 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7398 DILocalVariable *Var = DB.createParameterVariable(
7399 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7400 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7401 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7402 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7403 &(*Func->begin()));
7404 }
7405}
7406
7408 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7410 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7413 SmallVector<Type *> ParameterTypes;
7414 if (OMPBuilder.Config.isTargetDevice()) {
7415 // Add the "implicit" runtime argument we use to provide launch specific
7416 // information for target devices.
7417 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7418 ParameterTypes.push_back(Int8PtrTy);
7419
7420 // All parameters to target devices are passed as pointers
7421 // or i64. This assumes 64-bit address spaces/pointers.
7422 for (auto &Arg : Inputs)
7423 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7424 ? Arg->getType()
7425 : Type::getInt64Ty(Builder.getContext()));
7426 } else {
7427 for (auto &Arg : Inputs)
7428 ParameterTypes.push_back(Arg->getType());
7429 }
7430
7431 auto BB = Builder.GetInsertBlock();
7432 auto M = BB->getModule();
7433 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7434 /*isVarArg*/ false);
7435 auto Func =
7436 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7437
7438 // Forward target-cpu and target-features function attributes from the
7439 // original function to the new outlined function.
7440 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7441
7442 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7443 if (TargetCpuAttr.isStringAttribute())
7444 Func->addFnAttr(TargetCpuAttr);
7445
7446 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7447 if (TargetFeaturesAttr.isStringAttribute())
7448 Func->addFnAttr(TargetFeaturesAttr);
7449
7450 if (OMPBuilder.Config.isTargetDevice()) {
7451 Value *ExecMode =
7452 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7453 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7454 }
7455
7456 // Save insert point.
7457 IRBuilder<>::InsertPointGuard IPG(Builder);
7458 // We will generate the entries in the outlined function but the debug
7459 // location may still be pointing to the parent function. Reset it now.
7461
7462 // Generate the region into the function.
7463 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7464 Builder.SetInsertPoint(EntryBB);
7465
7466 // Insert target init call in the device compilation pass.
7467 if (OMPBuilder.Config.isTargetDevice())
7468 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7469
7470 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7471
7472 // As we embed the user code in the middle of our target region after we
7473 // generate entry code, we must move what allocas we can into the entry
7474 // block to avoid possible breaking optimisations for device
7475 if (OMPBuilder.Config.isTargetDevice())
7477
7478 // Insert target deinit call in the device compilation pass.
7479 BasicBlock *OutlinedBodyBB =
7480 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7482 Builder.saveIP(),
7483 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7484 if (!AfterIP)
7485 return AfterIP.takeError();
7486 Builder.restoreIP(*AfterIP);
7487 if (OMPBuilder.Config.isTargetDevice())
7488 OMPBuilder.createTargetDeinit(Builder);
7489
7490 // Insert return instruction.
7491 Builder.CreateRetVoid();
7492
7493 // New Alloca IP at entry point of created device function.
7494 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7495 auto AllocaIP = Builder.saveIP();
7496
7497 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7498
7499 // Skip the artificial dyn_ptr on the device.
7500 const auto &ArgRange =
7501 OMPBuilder.Config.isTargetDevice()
7502 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7503 : Func->args();
7504
7506
7507 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7508 // Things like GEP's can come in the form of Constants. Constants and
7509 // ConstantExpr's do not have access to the knowledge of what they're
7510 // contained in, so we must dig a little to find an instruction so we
7511 // can tell if they're used inside of the function we're outlining. We
7512 // also replace the original constant expression with a new instruction
7513 // equivalent; an instruction as it allows easy modification in the
7514 // following loop, as we can now know the constant (instruction) is
7515 // owned by our target function and replaceUsesOfWith can now be invoked
7516 // on it (cannot do this with constants it seems). A brand new one also
7517 // allows us to be cautious as it is perhaps possible the old expression
7518 // was used inside of the function but exists and is used externally
7519 // (unlikely by the nature of a Constant, but still).
7520 // NOTE: We cannot remove dead constants that have been rewritten to
7521 // instructions at this stage, we run the risk of breaking later lowering
7522 // by doing so as we could still be in the process of lowering the module
7523 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7524 // constants we have created rewritten versions of.
7525 if (auto *Const = dyn_cast<Constant>(Input))
7526 convertUsersOfConstantsToInstructions(Const, Func, false);
7527
7528 // Collect users before iterating over them to avoid invalidating the
7529 // iteration in case a user uses Input more than once (e.g. a call
7530 // instruction).
7531 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7532 // Collect all the instructions
7534 if (auto *Instr = dyn_cast<Instruction>(User))
7535 if (Instr->getFunction() == Func)
7536 Instr->replaceUsesOfWith(Input, InputCopy);
7537 };
7538
7539 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7540
7541 // Rewrite uses of input valus to parameters.
7542 for (auto InArg : zip(Inputs, ArgRange)) {
7543 Value *Input = std::get<0>(InArg);
7544 Argument &Arg = std::get<1>(InArg);
7545 Value *InputCopy = nullptr;
7546
7548 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7549 if (!AfterIP)
7550 return AfterIP.takeError();
7551 Builder.restoreIP(*AfterIP);
7552 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7553
7554 // In certain cases a Global may be set up for replacement, however, this
7555 // Global may be used in multiple arguments to the kernel, just segmented
7556 // apart, for example, if we have a global array, that is sectioned into
7557 // multiple mappings (technically not legal in OpenMP, but there is a case
7558 // in Fortran for Common Blocks where this is neccesary), we will end up
7559 // with GEP's into this array inside the kernel, that refer to the Global
7560 // but are technically seperate arguments to the kernel for all intents and
7561 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7562 // index, it will fold into an referal to the Global, if we then encounter
7563 // this folded GEP during replacement all of the references to the
7564 // Global in the kernel will be replaced with the argument we have generated
7565 // that corresponds to it, including any other GEP's that refer to the
7566 // Global that may be other arguments. This will invalidate all of the other
7567 // preceding mapped arguments that refer to the same global that may be
7568 // seperate segments. To prevent this, we defer global processing until all
7569 // other processing has been performed.
7570 if (isa<GlobalValue>(Input)) {
7571 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7572 continue;
7573 }
7574
7575 if (isa<ConstantData>(Input))
7576 continue;
7577
7578 ReplaceValue(Input, InputCopy, Func);
7579 }
7580
7581 // Replace all of our deferred Input values, currently just Globals.
7582 for (auto Deferred : DeferredReplacement)
7583 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7584
7585 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7586 ValueReplacementMap);
7587 return Func;
7588}
7589/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7590/// of pointers containing shared data between the parent task and the created
7591/// task.
7593 IRBuilderBase &Builder,
7594 Value *TaskWithPrivates,
7595 Type *TaskWithPrivatesTy) {
7596
7597 Type *TaskTy = OMPIRBuilder.Task;
7598 LLVMContext &Ctx = Builder.getContext();
7599 Value *TaskT =
7600 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7601 Value *Shareds = TaskT;
7602 // TaskWithPrivatesTy can be one of the following
7603 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7604 // %struct.privates }
7605 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7606 //
7607 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7608 // its first member has to be the task descriptor. TaskTy is the type of the
7609 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7610 // first member of TaskT, gives us the pointer to shared data.
7611 if (TaskWithPrivatesTy != TaskTy)
7612 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7613 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7614}
7615/// Create an entry point for a target task with the following.
7616/// It'll have the following signature
7617/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7618/// This function is called from emitTargetTask once the
7619/// code to launch the target kernel has been outlined already.
7620/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7621/// into the task structure so that the deferred target task can access this
7622/// data even after the stack frame of the generating task has been rolled
7623/// back. Offloading arrays contain base pointers, pointers, sizes etc
7624/// of the data that the target kernel will access. These in effect are the
7625/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7627 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7628 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7629 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7630
7631 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7632 // This is because PrivatesTy is the type of the structure in which
7633 // we pass the offloading arrays to the deferred target task.
7634 assert((!NumOffloadingArrays || PrivatesTy) &&
7635 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7636 "to privatize");
7637
7638 Module &M = OMPBuilder.M;
7639 // KernelLaunchFunction is the target launch function, i.e.
7640 // the function that sets up kernel arguments and calls
7641 // __tgt_target_kernel to launch the kernel on the device.
7642 //
7643 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7644
7645 // StaleCI is the CallInst which is the call to the outlined
7646 // target kernel launch function. If there are local live-in values
7647 // that the outlined function uses then these are aggregated into a structure
7648 // which is passed as the second argument. If there are no local live-in
7649 // values or if all values used by the outlined kernel are global variables,
7650 // then there's only one argument, the threadID. So, StaleCI can be
7651 //
7652 // %structArg = alloca { ptr, ptr }, align 8
7653 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7654 // store ptr %20, ptr %gep_, align 8
7655 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7656 // store ptr %21, ptr %gep_8, align 8
7657 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7658 //
7659 // OR
7660 //
7661 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7663 StaleCI->getIterator());
7664
7665 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7666
7667 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7668 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7669 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7670
7671 auto ProxyFnTy =
7672 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7673 /* isVarArg */ false);
7674 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7675 ".omp_target_task_proxy_func",
7676 Builder.GetInsertBlock()->getModule());
7677 Value *ThreadId = ProxyFn->getArg(0);
7678 Value *TaskWithPrivates = ProxyFn->getArg(1);
7679 ThreadId->setName("thread.id");
7680 TaskWithPrivates->setName("task");
7681
7682 bool HasShareds = SharedArgsOperandNo > 0;
7683 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7684 BasicBlock *EntryBB =
7685 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7686 Builder.SetInsertPoint(EntryBB);
7687
7688 SmallVector<Value *> KernelLaunchArgs;
7689 KernelLaunchArgs.reserve(StaleCI->arg_size());
7690 KernelLaunchArgs.push_back(ThreadId);
7691
7692 if (HasOffloadingArrays) {
7693 assert(TaskTy != TaskWithPrivatesTy &&
7694 "If there are offloading arrays to pass to the target"
7695 "TaskTy cannot be the same as TaskWithPrivatesTy");
7696 (void)TaskTy;
7697 Value *Privates =
7698 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7699 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7700 KernelLaunchArgs.push_back(
7701 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7702 }
7703
7704 if (HasShareds) {
7705 auto *ArgStructAlloca =
7706 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7707 assert(ArgStructAlloca &&
7708 "Unable to find the alloca instruction corresponding to arguments "
7709 "for extracted function");
7710 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7711
7712 AllocaInst *NewArgStructAlloca =
7713 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7714
7715 Value *SharedsSize =
7716 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7717
7719 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7720
7721 Builder.CreateMemCpy(
7722 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7723 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7724 KernelLaunchArgs.push_back(NewArgStructAlloca);
7725 }
7726 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7727 Builder.CreateRetVoid();
7728 return ProxyFn;
7729}
7731
// NOTE(review): the function-signature line above was lost in extraction
// (it was rendered as a hyperlink); from the call sites below it is the
// helper used as getOffloadingArrayType(Value *V) -- confirm upstream.
// Recovers the array type behind an offloading-array pointer: the source
// element type when V is a GEP, or the allocated type when V is an alloca.
 7732 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
 7733 return GEP->getSourceElementType();
 7734 if (auto *Alloca = dyn_cast<AllocaInst>(V))
 7735 return Alloca->getAllocatedType();
 7736
// Any other producer of an offloading array is unsupported; the return
// after llvm_unreachable only silences "missing return" diagnostics.
 7737 llvm_unreachable("Unhandled Instruction type");
 7738 return nullptr;
 7739}
7740// This function returns a struct that has at most two members.
7741// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7742// descriptor. The second member, if needed, is a struct containing arrays
7743// that need to be passed to the offloaded target kernel. For example,
7744// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7745// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7746// respectively, then the types created by this function are
7747//
7748// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7749// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7750// %struct.privates }
7751// %struct.task_with_privates is returned by this function.
7752// If there aren't any offloading arrays to pass to the target kernel,
7753// %struct.kmp_task_ompbuilder_t is returned.
7754static StructType *
// NOTE(review): the line carrying the function name and first parameter
// (original line 7755) was lost in extraction; the body below matches
// createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ...) -- confirm.
7756 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7757
// Nothing to privatize: the plain kmp_task_t descriptor type suffices.
7758 if (OffloadingArraysToPrivatize.empty())
7759 return OMPIRBuilder.Task;
7760
// One struct field per privatized array, in the given order, each typed by
// the array type recovered from the pointer that produces it.
7761 SmallVector<Type *, 4> StructFieldTypes;
7762 for (Value *V : OffloadingArraysToPrivatize) {
7763 assert(V->getType()->isPointerTy() &&
7764 "Expected pointer to array to privatize. Got a non-pointer value "
7765 "instead");
7766 Type *ArrayTy = getOffloadingArrayType(V);
7767 assert(ArrayTy && "ArrayType cannot be nullptr");
7768 StructFieldTypes.push_back(ArrayTy);
7769 }
7770 StructType *PrivatesStructTy =
7771 StructType::create(StructFieldTypes, "struct.privates");
7772 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7773 "struct.task_with_privates");
7774}
// NOTE(review): the function-name line (original 7775) and several parameter
// lines (7778, 7780-7782) were lost in extraction. From the body, this is the
// static helper that outlines a target region: it wraps createOutlinedFunction
// in a callback and registers the result as a target-region entry, filling in
// OutlinedFn and OutlinedFnID for the caller -- confirm against upstream.
7776 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7777 TargetRegionEntryInfo &EntryInfo,
7779 Function *&OutlinedFn, Constant *&OutlinedFnID,
7783
// Deferred generator: only invoked by emitTargetRegionFunction once the
// mangled entry-point name is known.
7784 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7785 [&](StringRef EntryFnName) {
7786 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7787 EntryFnName, Inputs, CBFunc,
7788 ArgAccessorFuncCB);
7789 };
7790
7791 return OMPBuilder.emitTargetRegionFunction(
7792 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7793 OutlinedFnID);
7794}
7795
// NOTE(review): the signature line(s) of OpenMPIRBuilder::emitTargetTask
// (original lines 7796, 7798-7799) and a handful of hyperlinked call lines
// throughout the body (e.g. 7937-7938, 7957, 8007, 8019, 8030, 8060, 8092,
// 8112, 8114, 8132, 8139, 8156, 8160, 8176) were lost in extraction; the
// remaining tokens are reproduced verbatim -- confirm against upstream.
7797 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7800 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7801
7802 // The following explains the code-gen scenario for the `target` directive. A
7803 // similar scenario is followed for other device-related directives (e.g.
7804 // `target enter data`) but in similar fashion since we only need to emit task
7805 // that encapsulates the proper runtime call.
7806 //
7807 // When we arrive at this function, the target region itself has been
7808 // outlined into the function OutlinedFn.
7809 // So at this point, for
7810 // --------------------------------------------------------------
7811 // void user_code_that_offloads(...) {
7812 // omp target depend(..) map(from:a) map(to:b) private(i)
7813 // do i = 1, 10
7814 // a(i) = b(i) + n
7815 // }
7816 //
7817 // --------------------------------------------------------------
7818 //
7819 // we have
7820 //
7821 // --------------------------------------------------------------
7822 //
7823 // void user_code_that_offloads(...) {
7824 // %.offload_baseptrs = alloca [2 x ptr], align 8
7825 // %.offload_ptrs = alloca [2 x ptr], align 8
7826 // %.offload_mappers = alloca [2 x ptr], align 8
7827 // ;; target region has been outlined and now we need to
7828 // ;; offload to it via a target task.
7829 // }
7830 // void outlined_device_function(ptr a, ptr b, ptr n) {
7831 // n = *n_ptr;
7832 // do i = 1, 10
7833 // a(i) = b(i) + n
7834 // }
7835 //
7836 // We have to now do the following
7837 // (i) Make an offloading call to outlined_device_function using the OpenMP
7838 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7839 // emitted by emitKernelLaunch
7840 // (ii) Create a task entry point function that calls kernel_launch_function
7841 // and is the entry point for the target task. See
7842 // '@.omp_target_task_proxy_func in the pseudocode below.
7843 // (iii) Create a task with the task entry point created in (ii)
7844 //
7845 // That is we create the following
7846 // struct task_with_privates {
7847 // struct kmp_task_ompbuilder_t task_struct;
7848 // struct privates {
7849 // [2 x ptr] ; baseptrs
7850 // [2 x ptr] ; ptrs
7851 // [2 x i64] ; sizes
7852 // }
7853 // }
7854 // void user_code_that_offloads(...) {
7855 // %.offload_baseptrs = alloca [2 x ptr], align 8
7856 // %.offload_ptrs = alloca [2 x ptr], align 8
7857 // %.offload_sizes = alloca [2 x i64], align 8
7858 //
7859 // %structArg = alloca { ptr, ptr, ptr }, align 8
7860 // %strucArg[0] = a
7861 // %strucArg[1] = b
7862 // %strucArg[2] = &n
7863 //
7864 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7865 // sizeof(kmp_task_ompbuilder_t),
7866 // sizeof(structArg),
7867 // @.omp_target_task_proxy_func,
7868 // ...)
7869 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7870 // sizeof(structArg))
7871 // memcpy(target_task_with_privates->privates->baseptrs,
7872 // offload_baseptrs, sizeof(offload_baseptrs)
7873 // memcpy(target_task_with_privates->privates->ptrs,
7874 // offload_ptrs, sizeof(offload_ptrs)
7875 // memcpy(target_task_with_privates->privates->sizes,
7876 // offload_sizes, sizeof(offload_sizes)
7877 // dependencies_array = ...
7878 // ;; if nowait not present
7879 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7880 // call @__kmpc_omp_task_begin_if0(...)
7881 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7882 // %target_task_with_privates)
7883 // call @__kmpc_omp_task_complete_if0(...)
7884 // }
7885 //
7886 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7887 // ptr %task) {
7888 // %structArg = alloca {ptr, ptr, ptr}
7889 // %task_ptr = getelementptr(%task, 0, 0)
7890 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7891 // memcpy(%structArg, %shared_data, sizeof(%structArg))
7892 //
7893 // %offloading_arrays = getelementptr(%task, 0, 1)
7894 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7895 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7896 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7897 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7898 // %offload_sizes, %structArg)
7899 // }
7900 //
7901 // We need the proxy function because the signature of the task entry point
7902 // expected by kmpc_omp_task is always the same and will be different from
7903 // that of the kernel_launch function.
7904 //
7905 // kernel_launch_function is generated by emitKernelLaunch and has the
7906 // always_inline attribute. For this example, it'll look like so:
7907 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7908 // %offload_sizes, %structArg) alwaysinline {
7909 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7910 // ; load aggregated data from %structArg
7911 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7912 // ; offload_sizes
7913 // call i32 @__tgt_target_kernel(...,
7914 // outlined_device_function,
7915 // ptr %kernel_args)
7916 // }
7917 // void outlined_device_function(ptr a, ptr b, ptr n) {
7918 // n = *n_ptr;
7919 // do i = 1, 10
7920 // a(i) = b(i) + n
7921 // }
7922 //
// Split off the task alloca and task body regions so the outliner can treat
// [target.task.alloca, target.task.cont) as the region to extract.
7923 BasicBlock *TargetTaskBodyBB =
7924 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7925 BasicBlock *TargetTaskAllocaBB =
7926 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7927
7928 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7929 TargetTaskAllocaBB->begin());
7930 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7931
7932 OutlineInfo OI;
7933 OI.EntryBB = TargetTaskAllocaBB;
7934 OI.OuterAllocaBB = AllocaIP.getBlock();
7935
7936 // Add the thread ID argument.
// NOTE(review): the line(s) pushing the fake thread-id value (original
// 7937-7938) were lost in extraction; the call continues below.
7939 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7940
7941 // Generate the task body which will subsequently be outlined.
7942 Builder.restoreIP(TargetTaskBodyIP);
7943 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7944 return Err;
7945
7946 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7947 // it is given. These blocks are enumerated by
7948 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7949 // to be outside the region. In other words, OI.ExitBlock is expected to be
7950 // the start of the region after the outlining. We used to set OI.ExitBlock
7951 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7952 // except when the task body is a single basic block. In that case,
7953 // OI.ExitBlock is set to the single task body block and will get left out of
7954 // the outlining process. So, simply create a new empty block to which we
7955 // unconditionally branch from where TaskBodyCB left off
7956 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7958 /*IsFinished=*/true);
7959
// Only a deferred (nowait + device) target task privatizes the offloading
// arrays; constant-null and global arrays need no per-task copy.
7960 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7961 bool NeedsTargetTask = HasNoWait && DeviceID;
7962 if (NeedsTargetTask) {
7963 for (auto *V :
7964 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7965 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7966 RTArgs.SizesArray}) {
7967 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
7968 OffloadingArraysToPrivatize.push_back(V);
7969 OI.ExcludeArgsFromAggregate.push_back(V);
7970 }
7971 }
7972 }
// After outlining, replace the placeholder call with the real task-alloc /
// task-dispatch runtime sequence built below.
7973 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7974 DeviceID, OffloadingArraysToPrivatize](
7975 Function &OutlinedFn) mutable {
7976 assert(OutlinedFn.hasOneUse() &&
7977 "there must be a single user for the outlined function");
7978
7979 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7980
7981 // The first argument of StaleCI is always the thread id.
7982 // The next few arguments are the pointers to offloading arrays
7983 // if any. (see OffloadingArraysToPrivatize)
7984 // Finally, all other local values that are live-in into the outlined region
7985 // end up in a structure whose pointer is passed as the last argument. This
7986 // piece of data is passed in the "shared" field of the task structure. So,
7987 // we know we have to pass shareds to the task if the number of arguments is
7988 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
7989 // thread id. Further, for safety, we assert that the number of arguments of
7990 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
7991 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
7992 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
7993 assert((!HasShareds ||
7994 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
7995 "Wrong number of arguments for StaleCI when shareds are present");
7996 int SharedArgOperandNo =
7997 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
7998
7999 StructType *TaskWithPrivatesTy =
8000 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8001 StructType *PrivatesTy = nullptr;
8002
8003 if (!OffloadingArraysToPrivatize.empty())
8004 PrivatesTy =
8005 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8006
8008 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8009 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8010
8011 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8012 << "\n");
8013
8014 Builder.SetInsertPoint(StaleCI);
8015
8016 // Gather the arguments for emitting the runtime call.
8017 uint32_t SrcLocStrSize;
8018 Constant *SrcLocStr =
8020 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8021
8022 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8023 //
8024 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8025 // the DeviceID to the deferred task and also since
8026 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8027 Function *TaskAllocFn =
8028 !NeedsTargetTask
8029 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8031 OMPRTL___kmpc_omp_target_task_alloc);
8032
8033 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
8034 // call.
8035 Value *ThreadID = getOrCreateThreadID(Ident);
8036
8037 // Argument - `sizeof_kmp_task_t` (TaskSize)
8038 // Tasksize refers to the size in bytes of kmp_task_t data structure
8039 // plus any other data to be passed to the target task, if any, which
8040 // is packed into a struct. kmp_task_t and the struct so created are
8041 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8042 Value *TaskSize = Builder.getInt64(
8043 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8044
8045 // Argument - `sizeof_shareds` (SharedsSize)
8046 // SharedsSize refers to the shareds array size in the kmp_task_t data
8047 // structure.
8048 Value *SharedsSize = Builder.getInt64(0);
8049 if (HasShareds) {
8050 auto *ArgStructAlloca =
8051 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8052 assert(ArgStructAlloca &&
8053 "Unable to find the alloca instruction corresponding to arguments "
8054 "for extracted function");
8055 auto *ArgStructType =
8056 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8057 assert(ArgStructType && "Unable to find struct type corresponding to "
8058 "arguments for extracted function");
8059 SharedsSize =
8061 }
8062
8063 // Argument - `flags`
8064 // Task is tied iff (Flags & 1) == 1.
8065 // Task is untied iff (Flags & 1) == 0.
8066 // Task is final iff (Flags & 2) == 2.
8067 // Task is not final iff (Flags & 2) == 0.
8068 // A target task is not final and is untied.
8069 Value *Flags = Builder.getInt32(0);
8070
8071 // Emit the @__kmpc_omp_task_alloc runtime call
8072 // The runtime call returns a pointer to an area where the task captured
8073 // variables must be copied before the task is run (TaskData)
8074 CallInst *TaskData = nullptr;
8075
8076 SmallVector<llvm::Value *> TaskAllocArgs = {
8077 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8078 /*flags=*/Flags,
8079 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8080 /*task_func=*/ProxyFn};
8081
8082 if (NeedsTargetTask) {
8083 assert(DeviceID && "Expected non-empty device ID.");
8084 TaskAllocArgs.push_back(DeviceID);
8085 }
8086
8087 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8088
// Copy the shareds struct and each privatized offloading array into the
// runtime-allocated task storage so they outlive the spawning frame.
8089 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8090 if (HasShareds) {
8091 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8093 *this, Builder, TaskData, TaskWithPrivatesTy);
8094 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8095 SharedsSize);
8096 }
8097 if (!OffloadingArraysToPrivatize.empty()) {
8098 Value *Privates =
8099 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8100 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8101 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8102 [[maybe_unused]] Type *ArrayType =
8103 getOffloadingArrayType(PtrToPrivatize);
8104 assert(ArrayType && "ArrayType cannot be nullptr");
8105
8106 Type *ElementType = PrivatesTy->getElementType(i);
8107 assert(ElementType == ArrayType &&
8108 "ElementType should match ArrayType");
8109 (void)ArrayType;
8110
8111 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8113 Dst, Alignment, PtrToPrivatize, Alignment,
8115 }
8116 }
8117
8118 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8119
8120 // ---------------------------------------------------------------
8121 // V5.2 13.8 target construct
8122 // If the nowait clause is present, execution of the target task
8123 // may be deferred. If the nowait clause is not present, the target task is
8124 // an included task.
8125 // ---------------------------------------------------------------
8126 // The above means that the lack of a nowait on the target construct
8127 // translates to '#pragma omp task if(0)'
8128 if (!NeedsTargetTask) {
8129 if (DepArray) {
8130 Function *TaskWaitFn =
8131 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8133 TaskWaitFn,
8134 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8135 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8136 /*dep_list=*/DepArray,
8137 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8138 /*noalias_dep_list=*/
8140 }
8141 // Included task.
8142 Function *TaskBeginFn =
8143 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8144 Function *TaskCompleteFn =
8145 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8146 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8147 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8148 CI->setDebugLoc(StaleCI->getDebugLoc());
8149 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8150 } else if (DepArray) {
8151 // HasNoWait - meaning the task may be deferred. Call
8152 // __kmpc_omp_task_with_deps if there are dependencies,
8153 // else call __kmpc_omp_task
8154 Function *TaskFn =
8155 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8157 TaskFn,
8158 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8159 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8161 } else {
8162 // Emit the @__kmpc_omp_task runtime call to spawn the task
8163 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8164 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8165 }
8166
// The placeholder call and any helper instructions created purely for the
// outliner are no longer needed.
8167 StaleCI->eraseFromParent();
8168 for (Instruction *I : llvm::reverse(ToBeDeleted))
8169 I->eraseFromParent();
8170 };
8171 addOutlineInfo(std::move(OI));
8172
8173 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8174 << *(Builder.GetInsertBlock()) << "\n");
8175 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8177 << "\n");
8178 return Builder.saveIP();
8179}
8180
// NOTE(review): the function-name line (original 8181,
// OpenMPIRBuilder::emitOffloadingArraysAndArgs) was lost in extraction.
// Materializes the offloading arrays for CombinedInfo and then packages
// their addresses into RTArgs for the runtime call; any error from array
// emission is propagated unchanged.
8182 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8183 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8184 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8185 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8186 if (Error Err =
8187 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8188 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8189 return Err;
8190 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8191 return Error::success();
8192}
8193
8194static void emitTargetCall(
// NOTE(review): several parameter lines of this signature (original
// 8196-8199, 8201-8204) were lost in extraction; the visible parameters are
// reproduced verbatim -- confirm the full signature against upstream.
8195 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8200 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8205 bool HasNoWait) {
8206 // Generate a function call to the host fallback implementation of the target
8207 // region. This is called by the host when no offload entry was generated for
8208 // the target region and when the offloading call fails at runtime.
8209 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8211 Builder.restoreIP(IP);
8212 Builder.CreateCall(OutlinedFn, Args);
8213 return Builder.saveIP();
8214 };
8215
8216 bool HasDependencies = Dependencies.size() > 0;
8217 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8218
8220
// Body of the target task: either performs the kernel launch (offloading
// case) or falls back to the host implementation.
8221 auto TaskBodyCB =
8222 [&](Value *DeviceID, Value *RTLoc,
8223 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8224 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8225 // produce any.
8227 // emitKernelLaunch makes the necessary runtime call to offload the
8228 // kernel. We then outline all that code into a separate function
8229 // ('kernel_launch_function' in the pseudo code above). This function is
8230 // then called by the target task proxy function (see
8231 // '@.omp_target_task_proxy_func' in the pseudo code above)
8232 // "@.omp_target_task_proxy_func' is generated by
8233 // emitTargetTaskProxyFunction.
8234 if (OutlinedFnID && DeviceID)
8235 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8236 EmitTargetCallFallbackCB, KArgs,
8237 DeviceID, RTLoc, TargetTaskAllocaIP);
8238
8239 // We only need to do the outlining if `DeviceID` is set to avoid calling
8240 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8241 // generating the `else` branch of an `if` clause.
8242 //
8243 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8244 // In this case, we execute the host implementation directly.
8245 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8246 }());
8247
8248 OMPBuilder.Builder.restoreIP(AfterIP);
8249 return Error::success();
8250 };
8251
// 'else' branch of an 'if' clause (or the no-offload-entry case): run the
// host fallback, wrapped in a target task only when nowait/depend demand it.
8252 auto &&EmitTargetCallElse =
8253 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8255 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8256 // produce any.
8258 if (RequiresOuterTargetTask) {
8259 // Arguments that are intended to be directly forwarded to an
8260 // emitKernelLaunch call are passed as nullptr, since
8261 // OutlinedFnID=nullptr results in that call not being done.
8263 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8264 /*RTLoc=*/nullptr, AllocaIP,
8265 Dependencies, EmptyRTArgs, HasNoWait);
8266 }
8267 return EmitTargetCallFallbackCB(Builder.saveIP());
8268 }());
8269
8270 Builder.restoreIP(AfterIP);
8271 return Error::success();
8272 };
8273
// 'then' branch: set up mapping arrays, teams/threads clauses and kernel
// arguments, then launch (possibly via a target task).
8274 auto &&EmitTargetCallThen =
8275 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8277 Info.HasNoWait = HasNoWait;
8278 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8280 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8281 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8282 /*IsNonContiguous=*/true,
8283 /*ForEndCall=*/false))
8284 return Err;
8285
// Per-dimension teams count: runtime value if supplied, else the default.
8286 SmallVector<Value *, 3> NumTeamsC;
8287 for (auto [DefaultVal, RuntimeVal] :
8288 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8289 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8290 : Builder.getInt32(DefaultVal));
8291
8292 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8293 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8294 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8295 if (Clause)
8296 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8297 /*isSigned=*/false);
8298 return Clause;
8299 };
8300 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8301 if (Clause)
8302 Result =
8303 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8304 Result, Clause)
8305 : Clause;
8306 };
8307
8308 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8309 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8310 SmallVector<Value *, 3> NumThreadsC;
8311 Value *MaxThreadsClause =
8312 RuntimeAttrs.TeamsThreadLimit.size() == 1
8313 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8314 : nullptr;
8315
8316 for (auto [TeamsVal, TargetVal] : zip_equal(
8317 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8318 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8319 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8320
8321 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8322 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8323
8324 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8325 }
8326
8327 unsigned NumTargetItems = Info.NumberOfPtrs;
8328 // TODO: Use correct device ID
8329 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8330 uint32_t SrcLocStrSize;
8331 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8332 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8333 llvm::omp::IdentFlag(0), 0);
8334
8335 Value *TripCount = RuntimeAttrs.LoopTripCount
8336 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8337 Builder.getInt64Ty(),
8338 /*isSigned=*/false)
8339 : Builder.getInt64(0);
8340
8341 // TODO: Use correct DynCGGroupMem
8342 Value *DynCGGroupMem = Builder.getInt32(0);
8343
8344 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8345 NumTeamsC, NumThreadsC,
8346 DynCGGroupMem, HasNoWait);
8347
8348 // Assume no error was returned because TaskBodyCB and
8349 // EmitTargetCallFallbackCB don't produce any.
8351 // The presence of certain clauses on the target directive require the
8352 // explicit generation of the target task.
8353 if (RequiresOuterTargetTask)
8354 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8355 Dependencies, KArgs.RTArgs,
8356 Info.HasNoWait);
8357
8358 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8359 EmitTargetCallFallbackCB, KArgs,
8360 DeviceID, RTLoc, AllocaIP);
8361 }());
8362
8363 Builder.restoreIP(AfterIP);
8364 return Error::success();
8365 };
8366
8367 // If we don't have an ID for the target region, it means an offload entry
8368 // wasn't created. In this case we just run the host fallback directly and
8369 // ignore any potential 'if' clauses.
8370 if (!OutlinedFnID) {
8371 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8372 return;
8373 }
8374
8375 // If there's no 'if' clause, only generate the kernel launch code path.
8376 if (!IfCond) {
8377 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8378 return;
8379 }
8380
8381 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8382 EmitTargetCallElse, AllocaIP));
8383}
8384
// NOTE(review): the function-name line (original 8385,
// OpenMPIRBuilder::createTarget's return type and name) was lost in
// extraction -- confirm against upstream. Outlines the target region via the
// callbacks and, on the host, emits the offloading call for it.
8386 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8387 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8388 TargetRegionEntryInfo &EntryInfo,
8389 const TargetKernelDefaultAttrs &DefaultAttrs,
8390 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8391 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8394 CustomMapperCallbackTy CustomMapperCB,
8395 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8396
// An unreachable/invalid location produces no code.
8397 if (!updateToLocation(Loc))
8398 return InsertPointTy();
8399
8400 Builder.restoreIP(CodeGenIP);
8401
8402 Function *OutlinedFn;
8403 Constant *OutlinedFnID = nullptr;
8404 // The target region is outlined into its own function. The LLVM IR for
8405 // the target region itself is generated using the callbacks CBFunc
8406 // and ArgAccessorFuncCB
8408 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8409 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8410 return Err;
8411
8412 // If we are not on the target device, then we need to generate code
8413 // to make a remote call (offload) to the previously outlined function
8414 // that represents the target region. Do that now.
8415 if (!Config.isTargetDevice())
8416 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8417 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8418 CustomMapperCB, Dependencies, HasNowait);
8419 return Builder.saveIP();
8420}
8421
8422std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8423 StringRef FirstSeparator,
8424 StringRef Separator) {
8425 SmallString<128> Buffer;
8427 StringRef Sep = FirstSeparator;
8428 for (StringRef Part : Parts) {
8429 OS << Sep << Part;
8430 Sep = Separator;
8431 }
8432 return OS.str().str();
8433}
8434
8435std::string
// NOTE(review): the name line (original 8436) was lost in extraction; from
// the body this is the member that builds a platform-specific name from
// Parts using the configured separators -- confirm against upstream.
8437 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8438 Config.separator());
8439}
8440
// NOTE(review): the signature lines (original 8441-8442, the return type,
// name, and the Ty/Name parameters of getOrCreateInternalVariable) and the
// two linkage-constant lines (8454-8455) plus the address-space argument line
// (8459) were lost in extraction -- confirm against upstream.
// Returns a module-level internal variable with the given name, creating a
// zero-initialized global of type Ty on first request and caching it in
// InternalVars; repeated requests must agree on the type.
8443 unsigned AddressSpace) {
8444 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8445 if (Elem.second) {
// Cache hit: the caller must be asking for the same value type.
8446 assert(Elem.second->getValueType() == Ty &&
8447 "OMP internal variable has different type than requested");
8448 } else {
8449 // TODO: investigate the appropriate linkage type used for the global
8450 // variable for possibly changing that to internal or private, or maybe
8451 // create different versions of the function for different OMP internal
8452 // variables.
8453 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8456 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8457 Constant::getNullValue(Ty), Elem.first(),
8458 /*InsertBefore=*/nullptr,
// Align to the larger of the type's ABI alignment and the pointer ABI
// alignment for the requested address space.
8460 const DataLayout &DL = M.getDataLayout();
8461 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8462 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
8463 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8464 Elem.second = GV;
8465 }
8466
8467 return Elem.second;
8468}
8469
8470Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8471 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8472 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8473 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8474}
8475
// NOTE(review): the signature lines (original 8476-8477) were lost in
// extraction; from the body this is the helper computing a type's size as an
// i64 IR value via the classic GEP-on-null idiom (gep null, 1 then
// ptrtoint) -- confirm against upstream.
8478 Value *Null =
8479 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8480 Value *SizeGep =
8481 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8482 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8483 return SizePtrToInt;
8484}
8485
// NOTE(review): the signature lines (original 8486-8487) and the initializer
// line (8490, the constant-array construction assigned below) were lost in
// extraction -- confirm against upstream. Creates a private, unnamed_addr,
// constant global holding the map-type flags array used by offloading calls.
8488 std::string VarName) {
8489 llvm::Constant *MaptypesArrayInit =
8491 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8492 M, MaptypesArrayInit->getType(),
8493 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8494 VarName);
8495 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8496 return MaptypesArrayGlobal;
8497}
8498
// NOTE(review): the line with the function name and first parameter
// (original 8499, OpenMPIRBuilder::createMapperAllocas(const
// LocationDescription &Loc, ...)) was lost in extraction -- confirm.
// Emits the three stack arrays a mapper call needs (.offload_baseptrs,
// .offload_ptrs as [N x i8*], .offload_sizes as [N x i64]) at AllocaIP and
// records them in MapperAllocas; the insert point is then restored to Loc.
8500 InsertPointTy AllocaIP,
8501 unsigned NumOperands,
8502 struct MapperAllocas &MapperAllocas) {
8503 if (!updateToLocation(Loc))
8504 return;
8505
8506 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8507 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8508 Builder.restoreIP(AllocaIP);
8509 AllocaInst *ArgsBase = Builder.CreateAlloca(
8510 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8511 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8512 ".offload_ptrs");
8513 AllocaInst *ArgSizes = Builder.CreateAlloca(
8514 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
// Return to the original location before handing the allocas back.
8515 updateToLocation(Loc);
8516 MapperAllocas.ArgsBase = ArgsBase;
8517 MapperAllocas.Args = Args;
8518 MapperAllocas.ArgSizes = ArgSizes;
8519}
8520
// NOTE(review): the signature line (original 8521) and the three GEP call
// lines decaying the allocas to element pointers (8532, 8535, 8538) were lost
// in extraction -- confirm against upstream. Emits the call to MapperFunc
// with source location, device id, operand count, the three offload arrays,
// the map-type/map-name arrays, and a null trailing argument.
8522 Function *MapperFunc, Value *SrcLocInfo,
8523 Value *MaptypesArg, Value *MapnamesArg,
8525 int64_t DeviceID, unsigned NumOperands) {
8526 if (!updateToLocation(Loc))
8527 return;
8528
8529 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8530 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8531 Value *ArgsBaseGEP =
8533 {Builder.getInt32(0), Builder.getInt32(0)});
8534 Value *ArgsGEP =
8536 {Builder.getInt32(0), Builder.getInt32(0)});
8537 Value *ArgSizesGEP =
8539 {Builder.getInt32(0), Builder.getInt32(0)});
8540 Value *NullPtr =
8541 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8542 Builder.CreateCall(MapperFunc,
8543 {SrcLocInfo, Builder.getInt64(DeviceID),
8544 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8545 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8546}
8547
// NOTE(review): the signature line (original 8548,
// OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder, ...))
// and the GEP lines that assign RTArgs.BasePointersArray / PointersArray /
// SizesArray / MapTypesArray / MapNamesArray (originals 8570, 8574, 8578,
// 8581, 8593) were lost in extraction -- confirm against upstream.
// Decays the arrays previously emitted into Info.RTArgs to element pointers
// in RTArgs for the runtime call; all-null arguments when nothing is mapped.
8549 TargetDataRTArgs &RTArgs,
8550 TargetDataInfo &Info,
8551 bool ForEndCall) {
8552 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8553 "expected region end call to runtime only when end call is separate");
// With opaque pointers all of these are the same unqualified pointer type;
// the aliases only document intent.
8554 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8555 auto VoidPtrTy = UnqualPtrTy;
8556 auto VoidPtrPtrTy = UnqualPtrTy;
8557 auto Int64Ty = Type::getInt64Ty(M.getContext());
8558 auto Int64PtrTy = UnqualPtrTy;
8559
// Nothing mapped: the runtime accepts null array arguments.
8560 if (!Info.NumberOfPtrs) {
8561 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8562 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8563 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8564 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8565 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8566 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8567 return;
8568 }
8569
8571 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8572 Info.RTArgs.BasePointersArray,
8573 /*Idx0=*/0, /*Idx1=*/0);
8575 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8576 /*Idx0=*/0,
8577 /*Idx1=*/0);
8579 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8580 /*Idx0=*/0, /*Idx1=*/0);
// For a region-end call, prefer the end-specific map-types array if one was
// recorded (map types can differ between begin and end).
8582 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8583 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8584 : Info.RTArgs.MapTypesArray,
8585 /*Idx0=*/0,
8586 /*Idx1=*/0);
8587
8588 // Only emit the mapper information arrays if debug information is
8589 // requested.
8590 if (!Info.EmitDebug)
8591 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8592 else
8594 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8595 /*Idx0=*/0,
8596 /*Idx1=*/0);
8597 // If there is no user-defined mapper, set the mapper array to nullptr to
8598 // avoid an unnecessary data privatization
8599 if (!Info.HasMapper)
8600 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8601 else
8602 RTArgs.MappersArray =
8603 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8604}
8605
// NOTE(review): declaration line (orig. 8606, presumably
// `void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,`)
// and several store / GEP / pointer-cast callees (orig. 8610, 8622, 8647,
// 8652, 8657, 8663, 8665, 8668-8669) were dropped by the extraction — TODO
// confirm against upstream LLVM.
//
// For every non-contiguous mapped entry, allocates an on-stack array of
// `struct descriptor_dim { offset; count; stride; }` (one element per
// dimension, filled in reverse order), and stores a pointer to it into the
// corresponding slot of the offload pointers array.
8607 InsertPointTy CodeGenIP,
8608 MapInfosTy &CombinedInfo,
8609 TargetDataInfo &Info) {
// Local alias for the non-contiguity information gathered in CombinedInfo.
8611 CombinedInfo.NonContigInfo;
8612
8613 // Build an array of struct descriptor_dim and then assign it to
8614 // offload_args.
8615 //
8616 // struct descriptor_dim {
8617 // uint64_t offset;
8618 // uint64_t count;
8619 // uint64_t stride
8620 // };
8621 Type *Int64Ty = Builder.getInt64Ty();
8623 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8624 "struct.descriptor_dim");
8625
8626 enum { OffsetFD = 0, CountFD, StrideFD };
8627 // We need two index variable here since the size of "Dims" is the same as
8628 // the size of Components, however, the size of offset, count, and stride is
8629 // equal to the size of base declaration that is non-contiguous.
8630 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8631 // Skip emitting ir if dimension size is 1 since it cannot be
8632 // non-contiguous.
8633 if (NonContigInfo.Dims[I] == 1)
8634 continue;
// Allocas go to the dedicated alloca insert point; the fills go to the
// code-gen insert point.
8635 Builder.restoreIP(AllocaIP);
8636 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8637 AllocaInst *DimsAddr =
8638 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8639 Builder.restoreIP(CodeGenIP);
// Dimensions are written in reverse (RevIdx) order.
8640 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8641 unsigned RevIdx = EE - II - 1;
8642 Value *DimsLVal = Builder.CreateInBoundsGEP(
8643 DimsAddr->getAllocatedType(), DimsAddr,
8644 {Builder.getInt64(0), Builder.getInt64(II)});
8645 // Offset
8646 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8648 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8649 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8650 // Count
8651 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8653 NonContigInfo.Counts[L][RevIdx], CountLVal,
8654 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8655 // Stride
8656 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8658 NonContigInfo.Strides[L][RevIdx], StrideLVal,
// NOTE(review): alignment is taken from CountLVal here rather than
// StrideLVal — likely a benign copy-paste (both are i64*); verify upstream.
8659 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8660 }
8661 // args[I] = &dims
8662 Builder.restoreIP(CodeGenIP);
8664 DimsAddr, Builder.getPtrTy());
8666 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8667 Info.RTArgs.PointersArray, 0, I);
// L advances only for non-unit dimensions, matching the offset/count/stride
// tables that are indexed per non-contiguous base declaration.
8670 ++L;
8671 }
8672 }
8673
// Emits the conditional "init" (before mapping) or "del" (after mapping)
// region of a user-defined mapper function: if the mapped entity is an array
// section (or, for init, a pointer-and-object with base != begin) and the
// delete bit matches the phase, calls __tgt_push_mapper_component with the
// whole-array size and a map type restricted to allocation/deletion
// (TO/FROM cleared, IMPLICIT set).
//
// NOTE(review): extraction dropped tokens on orig. 8681 (presumably
// `BasicBlock *BodyBB = BasicBlock::Create(`), 8687, 8698, 8724, 8730 and
// 8738 (call-emitting lines) — TODO confirm against upstream LLVM.
8674void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8675 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8676 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8677 BasicBlock *ExitBB, bool IsInit) {
8678 StringRef Prefix = IsInit ? ".init" : ".del";
8679
8680 // Evaluate if this is an array section.
8682 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
// Section => more than one element.
8683 Value *IsArray =
8684 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
// Isolate the OMP_MAP_DELETE bit from the incoming map type.
8685 Value *DeleteBit = Builder.CreateAnd(
8686 MapType,
8688 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8689 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8690 Value *DeleteCond;
8691 Value *Cond;
8692 if (IsInit) {
8693 // base != begin?
8694 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8695 // IsPtrAndObj?
8696 Value *PtrAndObjBit = Builder.CreateAnd(
8697 MapType,
8699 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8700 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8701 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8702 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
// Init runs for array sections OR ptr-and-obj with distinct base, and only
// when the delete bit is NOT set.
8703 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8704 DeleteCond = Builder.CreateIsNull(
8705 DeleteBit,
8706 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8707 } else {
// Del runs for array sections only, and only when the delete bit IS set.
8708 Cond = IsArray;
8709 DeleteCond = Builder.CreateIsNotNull(
8710 DeleteBit,
8711 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8712 }
8713 Cond = Builder.CreateAnd(Cond, DeleteCond);
8714 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8715
8716 emitBlock(BodyBB, MapperFn);
8717 // Get the array size by multiplying element size and element number (i.e., \p
8718 // Size).
8719 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8720 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8721 // memory allocation/deletion purpose only.
8722 Value *MapTypeArg = Builder.CreateAnd(
8723 MapType,
8725 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8726 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8727 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8728 MapTypeArg = Builder.CreateOr(
8729 MapTypeArg,
8731 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8732 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8733
8734 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8735 // data structure.
8736 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8737 ArraySize, MapTypeArg, MapName};
8739 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8740 OffloadingArgs);
8741 }
8742
// NOTE(review): the declaration lines (orig. 8743-8744, presumably
// `Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
//      function_ref<MapInfosOrErrorTy(InsertPointTy, llvm::Value *,`) and
// tokens on orig. 8752-8753 (remaining i64/ptr params), 8762 (Function::Create),
// 8788 (size-to-element conversion), 8829 (shift of the previous size into the
// MEMBER_OF field), 8838/8861/8879/8887/8895/8902/8910 (constant/call
// operands), 8935 and 8960 were dropped by the extraction — TODO confirm
// against upstream LLVM before editing code.
//
// Emits a user-defined mapper function with the libomptarget mapper
// signature (handle, base, begin, size, map type, map name): it runs an
// optional "init" region, then loops over all elements, re-derives the map
// type for each component (map-type decay per OpenMP 5.0 1.2.6) and pushes
// each component via a child mapper or __tgt_push_mapper_component, and
// finally runs an optional "del" region.
8745 llvm::Value *BeginArg)>
8746 GenMapInfoCB,
8747 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
// Parameter list of the mapper function (all opaque pointers plus the
// size/map-type i64s on the dropped lines).
8748 SmallVector<Type *> Params;
8749 Params.emplace_back(Builder.getPtrTy());
8750 Params.emplace_back(Builder.getPtrTy());
8751 Params.emplace_back(Builder.getPtrTy());
8754 Params.emplace_back(Builder.getPtrTy());
8755
8756 auto *FnTy =
8757 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8758
8759 SmallString<64> TyStr;
8760 raw_svector_ostream Out(TyStr);
8761 Function *MapperFn =
// Mapper functions are opaque to the optimizer and must not unwind.
8763 MapperFn->addFnAttr(Attribute::NoInline);
8764 MapperFn->addFnAttr(Attribute::NoUnwind);
8765 MapperFn->addParamAttr(0, Attribute::NoUndef);
8766 MapperFn->addParamAttr(1, Attribute::NoUndef);
8767 MapperFn->addParamAttr(2, Attribute::NoUndef);
8768 MapperFn->addParamAttr(3, Attribute::NoUndef);
8769 MapperFn->addParamAttr(4, Attribute::NoUndef);
8770 MapperFn->addParamAttr(5, Attribute::NoUndef);
8771
8772 // Start the mapper function code generation.
8773 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
// The caller's insert point is restored before returning.
8774 auto SavedIP = Builder.saveIP();
8775 Builder.SetInsertPoint(EntryBB);
8776
// Name the incoming runtime arguments.
8777 Value *MapperHandle = MapperFn->getArg(0);
8778 Value *BaseIn = MapperFn->getArg(1);
8779 Value *BeginIn = MapperFn->getArg(2);
8780 Value *Size = MapperFn->getArg(3);
8781 Value *MapType = MapperFn->getArg(4);
8782 Value *MapName = MapperFn->getArg(5);
8783
8784 // Compute the starting and end addresses of array elements.
8785 // Prepare common arguments for array initiation and deletion.
8786 // Convert the size in bytes into the number of array elements.
8787 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8789 Value *PtrBegin = BeginIn;
8790 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8791
8792 // Emit array initiation if this is an array section and \p MapType indicates
8793 // that memory allocation is required.
8794 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8795 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8796 MapType, MapName, ElementSize, HeadBB,
8797 /*IsInit=*/true);
8798
8799 // Emit a for loop to iterate through SizeArg of elements and map all of them.
8800
8801 // Emit the loop header block.
8802 emitBlock(HeadBB, MapperFn);
8803 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8804 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8805 // Evaluate whether the initial condition is satisfied.
8806 Value *IsEmpty =
8807 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8808 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8809
8810 // Emit the loop body block.
8811 emitBlock(BodyBB, MapperFn);
8812 BasicBlock *LastBB = BodyBB;
// Loop induction variable: the current element pointer.
8813 PHINode *PtrPHI =
8814 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8815 PtrPHI->addIncoming(PtrBegin, HeadBB);
8816
8817 // Get map clause information. Fill up the arrays with all mapped variables.
8818 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8819 if (!Info)
8820 return Info.takeError();
8821
8822 // Call the runtime API __tgt_mapper_num_components to get the number of
8823 // pre-existing components.
8824 Value *OffloadingArgs[] = {MapperHandle};
8825 Value *PreviousSize = Builder.CreateCall(
8826 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8827 OffloadingArgs);
// Shift the component count into the MEMBER_OF bit field (the shift-amount
// operand was on the dropped line 8829).
8828 Value *ShiftedPreviousSize =
8830
8831 // Fill up the runtime mapper handle for all components.
8832 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8833 Value *CurBaseArg = Info->BasePointers[I];
8834 Value *CurBeginArg = Info->Pointers[I];
8835 Value *CurSizeArg = Info->Sizes[I];
8836 Value *CurNameArg = Info->Names.size()
8837 ? Info->Names[I]
8839
8840 // Extract the MEMBER_OF field from the map type.
8841 Value *OriMapType = Builder.getInt64(
8842 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8843 Info->Types[I]));
8844 Value *MemberMapType =
8845 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8846
8847 // Combine the map type inherited from user-defined mapper with that
8848 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8849 // bits of the \a MapType, which is the input argument of the mapper
8850 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8851 // bits of MemberMapType.
8852 // [OpenMP 5.0], 1.2.6. map-type decay.
8853 // | alloc | to | from | tofrom | release | delete
8854 // ----------------------------------------------------------
8855 // alloc | alloc | alloc | alloc | alloc | release | delete
8856 // to | alloc | to | alloc | to | release | delete
8857 // from | alloc | alloc | from | from | release | delete
8858 // tofrom | alloc | to | from | tofrom | release | delete
8859 Value *LeftToFrom = Builder.CreateAnd(
8860 MapType,
8862 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8863 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8864 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
// Four-way dispatch on the caller's TO/FROM bits, merged by a PHI at EndBB.
8865 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8866 BasicBlock *AllocElseBB =
8867 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8868 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8869 BasicBlock *ToElseBB =
8870 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8871 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8872 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8873 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8874 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8875 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8876 emitBlock(AllocBB, MapperFn);
8877 Value *AllocMapType = Builder.CreateAnd(
8878 MemberMapType,
8880 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8881 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8882 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8883 Builder.CreateBr(EndBB);
8884 emitBlock(AllocElseBB, MapperFn);
8885 Value *IsTo = Builder.CreateICmpEQ(
8886 LeftToFrom,
8888 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8889 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8890 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8891 // In case of to, clear OMP_MAP_FROM.
8892 emitBlock(ToBB, MapperFn);
8893 Value *ToMapType = Builder.CreateAnd(
8894 MemberMapType,
8896 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8897 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8898 Builder.CreateBr(EndBB);
8899 emitBlock(ToElseBB, MapperFn);
8900 Value *IsFrom = Builder.CreateICmpEQ(
8901 LeftToFrom,
8903 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8904 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8905 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8906 // In case of from, clear OMP_MAP_TO.
8907 emitBlock(FromBB, MapperFn);
8908 Value *FromMapType = Builder.CreateAnd(
8909 MemberMapType,
8911 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8912 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8913 // In case of tofrom, do nothing.
8914 emitBlock(EndBB, MapperFn);
8915 LastBB = EndBB;
8916 PHINode *CurMapType =
8917 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8918 CurMapType->addIncoming(AllocMapType, AllocBB);
8919 CurMapType->addIncoming(ToMapType, ToBB);
8920 CurMapType->addIncoming(FromMapType, FromBB);
8921 CurMapType->addIncoming(MemberMapType, ToElseBB);
8922
8923 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8924 CurSizeArg, CurMapType, CurNameArg};
8925
// A per-component custom mapper, if any, takes precedence over the generic
// runtime push.
8926 auto ChildMapperFn = CustomMapperCB(I);
8927 if (!ChildMapperFn)
8928 return ChildMapperFn.takeError();
8929 if (*ChildMapperFn) {
8930 // Call the corresponding mapper function.
8931 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8932 } else {
8933 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8934 // data structure.
8936 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8937 OffloadingArgs);
8938 }
8939 }
8940
8941 // Update the pointer to point to the next element that needs to be mapped,
8942 // and check whether we have mapped all elements.
8943 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8944 "omp.arraymap.next");
8945 PtrPHI->addIncoming(PtrNext, LastBB);
8946 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8947 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8948 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8949
8950 emitBlock(ExitBB, MapperFn);
8951 // Emit array deletion if this is an array section and \p MapType indicates
8952 // that deletion is required.
8953 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8954 MapType, MapName, ElementSize, DoneBB,
8955 /*IsInit=*/false);
8956
8957 // Emit the function exit block.
8958 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8959
// Restore the caller's insertion point and hand back the finished function.
8961 Builder.restoreIP(SavedIP);
8962 return MapperFn;
8963 }
8964
// NOTE(review): declaration line (orig. 8965, presumably
// `Error OpenMPIRBuilder::emitOffloadingArrays(`) and tokens on orig.
// 9037, 9041, 9044, 9055, 9072, 9097, 9101, 9119, 9123-9124, 9127, 9131,
// 9146 and 9149 (alloca/memcpy/GEP/store callees and operands) were dropped
// by the extraction — TODO confirm against upstream LLVM before editing code.
//
// Materializes the offloading argument arrays for a target region/data
// construct: allocas for base pointers, pointers and mappers; a constant
// global (optionally copied to a stack buffer when some sizes are runtime
// values) for sizes; constant globals for map types and optional map names;
// then fills all per-argument slots and, if needed, the non-contiguous
// descriptors.
8966 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8967 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8968 bool IsNonContiguous,
8969 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8970
8971 // Reset the array information.
8972 Info.clearArrayInfo();
8973 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8974
8975 if (Info.NumberOfPtrs == 0)
8976 return Error::success();
8977
8978 Builder.restoreIP(AllocaIP);
8979 // Detect if we have any capture size requiring runtime evaluation of the
8980 // size so that a constant array could be eventually used.
8981 ArrayType *PointerArrayType =
8982 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8983
8984 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8985 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8986
8987 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8988 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8989 AllocaInst *MappersArray = Builder.CreateAlloca(
8990 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8991 Info.RTArgs.MappersArray = MappersArray;
8992
8993 // If we don't have any VLA types or other types that require runtime
8994 // evaluation, we can use a constant array for the map sizes, otherwise we
8995 // need to fill up the arrays as we do for the pointers.
8996 Type *Int64Ty = Builder.getInt64Ty();
8997 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8998 ConstantInt::get(Int64Ty, 0));
// RuntimeSizes marks entries whose size must be stored at run time.
8999 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9000 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9001 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9002 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
// Non-contiguous entries encode the dimension count instead of a byte size.
9003 if (IsNonContiguous &&
9004 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9005 CombinedInfo.Types[I] &
9006 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9007 ConstSizes[I] =
9008 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9009 else
9010 ConstSizes[I] = CI;
9011 continue;
9012 }
9013 }
9014 RuntimeSizes.set(I);
9015 }
9016
9017 if (RuntimeSizes.all()) {
// All sizes are runtime values: plain stack array, no constant global.
9018 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9019 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9020 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9021 restoreIPandDebugLoc(Builder, CodeGenIP);
9022 } else {
// At least some sizes are compile-time constants: emit them as a private
// constant global.
9023 auto *SizesArrayInit = ConstantArray::get(
9024 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9025 std::string Name = createPlatformSpecificName({"offload_sizes"});
9026 auto *SizesArrayGbl =
9027 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9028 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9029 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9030
9031 if (!RuntimeSizes.any()) {
9032 Info.RTArgs.SizesArray = SizesArrayGbl;
9033 } else {
// Mixed case: memcpy the constant global into a stack buffer, then patch
// the runtime-sized slots below (alloca/memcpy callees were on the dropped
// lines 9037 and 9041).
9034 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9035 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9036 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9038 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9039 Buffer->setAlignment(OffloadSizeAlign);
9040 restoreIPandDebugLoc(Builder, CodeGenIP);
9042 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9043 SizesArrayGbl, OffloadSizeAlign,
9045 IndexSize,
9046 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9047
9048 Info.RTArgs.SizesArray = Buffer;
9049 }
9050 restoreIPandDebugLoc(Builder, CodeGenIP);
9051 }
9052
9053 // The map types are always constant so we don't need to generate code to
9054 // fill arrays. Instead, we create an array constant.
9055
9056 for (auto mapFlag : CombinedInfo.Types)
9057 Mapping.push_back(
9058 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9059 mapFlag));
9060 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9061 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9062 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9063
9064 // The information types are only built if provided.
9065 if (!CombinedInfo.Names.empty()) {
9066 auto *MapNamesArrayGbl = createOffloadMapnames(
9067 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9068 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9069 Info.EmitDebug = true;
9070 } else {
9071 Info.RTArgs.MapNamesArray =
9073 Info.EmitDebug = false;
9074 }
9075
9076 // If there's a present map type modifier, it must not be applied to the end
9077 // of a region, so generate a separate map type array in that case.
9078 if (Info.separateBeginEndCalls()) {
9079 bool EndMapTypesDiffer = false;
9080 for (uint64_t &Type : Mapping) {
9081 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9082 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9083 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9084 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9085 EndMapTypesDiffer = true;
9086 }
9087 }
9088 if (EndMapTypesDiffer) {
9089 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9090 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9091 }
9092 }
9093
// Per-argument fill loop: base pointer, optional device-pointer bookkeeping,
// pointer, runtime size (if flagged) and mapper slot.
9094 PointerType *PtrTy = Builder.getPtrTy();
9095 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9096 Value *BPVal = CombinedInfo.BasePointers[I];
9098 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9099 0, I);
9100 Builder.CreateAlignedStore(BPVal, BP,
9102
9103 if (Info.requiresDevicePointerInfo()) {
9104 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
// use_device_ptr: a dedicated alloca receives the translated pointer.
9105 CodeGenIP = Builder.saveIP();
9106 Builder.restoreIP(AllocaIP);
9107 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9108 Builder.restoreIP(CodeGenIP);
9109 if (DeviceAddrCB)
9110 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9111 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
// use_device_addr: the base-pointer slot itself is reused.
9112 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9113 if (DeviceAddrCB)
9114 DeviceAddrCB(I, BP);
9115 }
9116 }
9117
9118 Value *PVal = CombinedInfo.Pointers[I];
9120 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9121 I);
9122 // TODO: Check alignment correct.
9125
9126 if (RuntimeSizes.test(I)) {
9128 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9129 /*Idx0=*/0,
9130 /*Idx1=*/I);
9132 Int64Ty,
9133 /*isSigned=*/true),
9134 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9135 }
9136 // Fill up the mapper array.
9137 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9138 Value *MFunc = ConstantPointerNull::get(PtrTy);
9139
9140 auto CustomMFunc = CustomMapperCB(I);
9141 if (!CustomMFunc)
9142 return CustomMFunc.takeError();
9143 if (*CustomMFunc)
9144 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9145
9147 MappersArray->getAllocatedType(), MappersArray,
9148 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9150 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9151 }
9152
9153 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9154 Info.NumberOfPtrs == 0)
9155 return Error::success();
9156 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9157 return Error::success();
9158 }
9159
// NOTE(review): declaration lines (orig. 9160-9161, presumably
// `void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {` plus the line
// loading the current block) and the branch/insert-point lines (orig. 9168,
// 9171) were dropped by the extraction — TODO confirm against upstream LLVM.
//
// Terminates the current block with a fall-through branch to the target
// unless there is no insert point or the block is already terminated.
9162
9163 if (!CurBB || CurBB->getTerminator()) {
9164 // If there is no insert point or the previous block is already
9165 // terminated, don't touch it.
9166 } else {
9167 // Otherwise, create a fall-through branch.
9169 }
9170
9172 }
9173
// NOTE(review): declaration line (orig. 9174, presumably
// `void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,`) and the
// lines reading the current block / setting the insert point (orig. 9176,
// 9192) were dropped by the extraction — TODO confirm against upstream LLVM.
//
// Appends BB to CurFn (right after the current block when possible), after
// falling through from the current block; an unused finished block is
// erased instead of being inserted.
9175 bool IsFinished) {
9177
9178 // Fall out of the current block (if necessary).
9179 emitBranch(BB);
9180
// A finished block with no predecessors is dead — drop it.
9181 if (IsFinished && BB->use_empty()) {
9182 BB->eraseFromParent();
9183 return;
9184 }
9185
9186 // Place the block after the current block, if possible, or else at
9187 // the end of the function.
9188 if (CurBB && CurBB->getParent())
9189 CurFn->insert(std::next(CurBB->getIterator()), BB);
9190 else
9191 CurFn->insert(CurFn->end(), BB);
9193 }
9194
// NOTE(review): declaration line (orig. 9195, presumably
// `Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy
// ThenGen,`) and the token on orig. 9208 were dropped by the extraction —
// TODO confirm against upstream LLVM.
//
// Emits an OpenMP `if` clause: when the condition folds to a constant, only
// the live arm is generated; otherwise a then/else/end diamond is built and
// both body callbacks are invoked.
9196 BodyGenCallbackTy ElseGen,
9197 InsertPointTy AllocaIP) {
9198 // If the condition constant folds and can be elided, try to avoid emitting
9199 // the condition and the dead arm of the if/else.
9200 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9201 auto CondConstant = CI->getSExtValue();
9202 if (CondConstant)
9203 return ThenGen(AllocaIP, Builder.saveIP());
9204
9205 return ElseGen(AllocaIP, Builder.saveIP());
9206 }
9207
9209
9210 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9211 // emit the conditional branch.
9212 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9213 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9214 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9215 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9216 // Emit the 'then' code.
9217 emitBlock(ThenBlock, CurFn)
9218 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9219 return Err;
9220 emitBranch(ContBlock);
9221 // Emit the 'else' code if present.
9222 // There is no need to emit line number for unconditional branch.
9223 emitBlock(ElseBlock, CurFn);
9224 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9225 return Err;
9226 // There is no need to emit line number for unconditional branch.
9227 emitBranch(ContBlock);
9228 // Emit the continuation block for code after the if.
9229 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9230 return Error::success();
9231 }
9232
// Decides, per OpenMP atomic semantics, whether an implicit flush must be
// emitted after an atomic construct of kind AK with ordering AO, emits it,
// and returns whether a flush was emitted.
//
// NOTE(review): the extraction dropped the `if (AO == ...)` condition lines
// for the Read and Write/Compare/Update cases (orig. 9244-9245, 9253-9254),
// the Capture sub-switch `case AtomicOrdering::...:` labels (orig. 9261,
// 9265, 9269-9271), the assert condition (orig. 9235-9236) and the FlushAO
// declaration (orig. 9240) — TODO confirm against upstream LLVM before
// editing code.
9233bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9234 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9235 "Unexpected Atomic Ordering.");
9238
9239 bool Flush = false;
9241
9242 switch (AK) {
// Atomic read: flush with acquire semantics for the matching orderings
// (condition on the dropped lines).
9243 case Read:
9246 FlushAO = AtomicOrdering::Acquire;
9247 Flush = true;
9248 }
9249 break;
// Atomic write/compare/update: flush with release semantics for the
// matching orderings (condition on the dropped lines).
9250 case Write:
9251 case Compare:
9252 case Update:
9255 FlushAO = AtomicOrdering::Release;
9256 Flush = true;
9257 }
9258 break;
// Atomic capture: ordering-dependent acquire/release/acq_rel flush (case
// labels on the dropped lines).
9259 case Capture:
9260 switch (AO) {
9262 FlushAO = AtomicOrdering::Acquire;
9263 Flush = true;
9264 break;
9266 FlushAO = AtomicOrdering::Release;
9267 Flush = true;
9268 break;
9272 Flush = true;
9273 break;
9274 default:
9275 // do nothing - leave silently.
9276 break;
9277 }
9278 }
9279
9280 if (Flush) {
9281 // Currently Flush RT call still doesn't take memory_ordering, so for when
9282 // that happens, this tries to do the resolution of which atomic ordering
9283 // to use with but issue the flush call
9284 // TODO: pass `FlushAO` after memory ordering support is added
9285 (void)FlushAO;
9286 emitFlush(Loc);
9287 }
9288
9289 // for AO == AtomicOrdering::Monotonic and all other case combinations
9290 // do nothing
9291 return Flush;
9292 }
9293
// NOTE(review): declaration lines (orig. 9294-9296, presumably
// `OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicRead(
//      const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,`)
// and the integer-cast-type operand on orig. 9332 were dropped by the
// extraction — TODO confirm against upstream LLVM.
//
// Emits an OpenMP atomic read of X into V: integers load atomically
// directly; structs go through the libcall-based AtomicInfo helper; floats
// and pointers load as an equally-sized integer and are bitcast/inttoptr'd
// back. A trailing flush is emitted when the ordering requires one.
9297 AtomicOrdering AO, InsertPointTy AllocaIP) {
9298 if (!updateToLocation(Loc))
9299 return Loc.IP;
9300
9301 assert(X.Var->getType()->isPointerTy() &&
9302 "OMP Atomic expects a pointer to target memory");
9303 Type *XElemTy = X.ElemTy;
9304 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9305 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9306 "OMP atomic read expected a scalar type");
9307
9308 Value *XRead = nullptr;
9309
9310 if (XElemTy->isIntegerTy()) {
// Plain atomic load for integer element types.
9311 LoadInst *XLD =
9312 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9313 XLD->setAtomic(AO);
9314 XRead = cast<Value>(XLD);
9315 } else if (XElemTy->isStructTy()) {
9316 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9317 // target does not support `atomicrmw` of the size of the struct
9318 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9319 OldVal->setAtomic(AO);
9320 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9321 unsigned LoadSize =
9322 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9323 OpenMPIRBuilder::AtomicInfo atomicInfo(
9324 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9325 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9326 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9327 XRead = AtomicLoadRes.first;
// The placeholder non-libcall load is replaced by the libcall result.
9328 OldVal->eraseFromParent();
9329 } else {
9330 // We need to perform atomic op as integer
9331 IntegerType *IntCastTy =
9333 LoadInst *XLoad =
9334 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9335 XLoad->setAtomic(AO);
9336 if (XElemTy->isFloatingPointTy()) {
9337 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9338 } else {
9339 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9340 }
9341 }
9342 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
// The (non-atomic) store of the read value into the v variable.
9343 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9344 return Builder.saveIP();
9345 }
9346
// NOTE(review): declaration lines (orig. 9347-9348, presumably
// `OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicWrite(
//      const LocationDescription &Loc,`) and the integer-cast-type operand
// on orig. 9377 were dropped by the extraction — TODO confirm upstream.
//
// Emits an OpenMP atomic write of Expr into X: integers store atomically
// directly; structs go through the libcall-based AtomicInfo helper; other
// scalars are bitcast to an equally-sized integer first. A trailing flush is
// emitted when the ordering requires one.
9349 AtomicOpValue &X, Value *Expr,
9350 AtomicOrdering AO, InsertPointTy AllocaIP) {
9351 if (!updateToLocation(Loc))
9352 return Loc.IP;
9353
9354 assert(X.Var->getType()->isPointerTy() &&
9355 "OMP Atomic expects a pointer to target memory");
9356 Type *XElemTy = X.ElemTy;
9357 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9358 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9359 "OMP atomic write expected a scalar type");
9360
9361 if (XElemTy->isIntegerTy()) {
9362 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9363 XSt->setAtomic(AO);
9364 } else if (XElemTy->isStructTy()) {
// The plain load only exists to derive size/alignment for the libcall.
9365 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9366 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9367 unsigned LoadSize =
9368 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9369 OpenMPIRBuilder::AtomicInfo atomicInfo(
9370 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9371 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9372 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9373 OldVal->eraseFromParent();
9374 } else {
9375 // We need to bitcast and perform atomic op as integers
9376 IntegerType *IntCastTy =
9378 Value *ExprCast =
9379 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9380 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9381 XSt->setAtomic(AO);
9382 }
9383
9384 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9385 return Builder.saveIP();
9386 }
9387
// NOTE(review): the declaration line (orig. 9388, presumably
// `OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(`) was
// dropped by the extraction — TODO confirm against upstream LLVM.
//
// Emits an OpenMP atomic update of X with Expr using RMWOp / UpdateOp by
// delegating to emitAtomicUpdate, then emits the post-atomic flush if the
// ordering requires one. Debug-only sanity checks reject pointer-to-nothing
// targets and the unsupported min/max forms.
9389 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9390 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9391 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9392 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9393 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9394 if (!updateToLocation(Loc))
9395 return Loc.IP;
9396
9397 LLVM_DEBUG({
9398 Type *XTy = X.Var->getType();
9399 assert(XTy->isPointerTy() &&
9400 "OMP Atomic expects a pointer to target memory");
9401 Type *XElemTy = X.ElemTy;
9402 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9403 XElemTy->isPointerTy()) &&
9404 "OMP atomic update expected a scalar type");
9405 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9406 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9407 "OpenMP atomic does not support LT or GT operations");
9408 });
9409
9410 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9411 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9412 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9413 if (!AtomicResult)
9414 return AtomicResult.takeError();
9415 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9416 return Builder.saveIP();
9417 }
9418
// Re-materializes the result of an atomicrmw operation as a plain (non-
// atomic) binary instruction: given the loaded old value Src1 and the
// operand Src2, returns the value the atomicrmw would have stored. Used for
// postfix-capture forms where the "new" value is needed as an SSA value.
//
// NOTE(review): the extraction dropped the `case AtomicRMWInst::...:` labels
// at orig. 9429 (the Nand arm, judging by the neg-of-and body) and orig.
// 9435-9450 (the group of unsupported ops falling into llvm_unreachable) —
// TODO confirm against upstream LLVM before editing code.
9419// FIXME: Duplicating AtomicExpand
9420Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9421 AtomicRMWInst::BinOp RMWOp) {
9422 switch (RMWOp) {
9423 case AtomicRMWInst::Add:
9424 return Builder.CreateAdd(Src1, Src2);
9425 case AtomicRMWInst::Sub:
9426 return Builder.CreateSub(Src1, Src2);
9427 case AtomicRMWInst::And:
9428 return Builder.CreateAnd(Src1, Src2);
// neg(and(...)) is the non-atomic equivalent of nand (label dropped).
9430 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9431 case AtomicRMWInst::Or:
9432 return Builder.CreateOr(Src1, Src2);
9433 case AtomicRMWInst::Xor:
9434 return Builder.CreateXor(Src1, Src2);
// The remaining ops (labels dropped around Max/Min and on orig. 9435-9450)
// have no plain-instruction equivalent here and are rejected.
9439 case AtomicRMWInst::Max:
9440 case AtomicRMWInst::Min:
9451 llvm_unreachable("Unsupported atomic update operation");
9452 }
9453 llvm_unreachable("Unsupported atomic update operation");
9454 }
9455
9456Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9457 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9459 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9460 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9461 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9462 // or a complex datatype.
9463 bool emitRMWOp = false;
9464 switch (RMWOp) {
9465 case AtomicRMWInst::Add:
9466 case AtomicRMWInst::And:
9468 case AtomicRMWInst::Or:
9469 case AtomicRMWInst::Xor:
9471 emitRMWOp = XElemTy;
9472 break;
9473 case AtomicRMWInst::Sub:
9474 emitRMWOp = (IsXBinopExpr && XElemTy);
9475 break;
9476 default:
9477 emitRMWOp = false;
9478 }
9479 emitRMWOp &= XElemTy->isIntegerTy();
9480
9481 std::pair<Value *, Value *> Res;
9482 if (emitRMWOp) {
9483 AtomicRMWInst *RMWInst =
9484 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9485 if (T.isAMDGPU()) {
9486 if (IsIgnoreDenormalMode)
9487 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9489 if (!IsFineGrainedMemory)
9490 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9492 if (!IsRemoteMemory)
9493 RMWInst->setMetadata("amdgpu.no.remote.memory",
9495 }
9496 Res.first = RMWInst;
9497 // not needed except in case of postfix captures. Generate anyway for
9498 // consistency with the else part. Will be removed with any DCE pass.
9499 // AtomicRMWInst::Xchg does not have a coressponding instruction.
9500 if (RMWOp == AtomicRMWInst::Xchg)
9501 Res.second = Res.first;
9502 else
9503 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9504 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9505 XElemTy->isStructTy()) {
9506 LoadInst *OldVal =
9507 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9508 OldVal->setAtomic(AO);
9509 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9510 unsigned LoadSize =
9511 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9512
9513 OpenMPIRBuilder::AtomicInfo atomicInfo(
9514 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9515 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9516 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9518 Instruction *CurBBTI = CurBB->getTerminator();
9519 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9520 BasicBlock *ExitBB =
9521 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9522 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9523 X->getName() + ".atomic.cont");
9524 ContBB->getTerminator()->eraseFromParent();
9525 Builder.restoreIP(AllocaIP);
9526 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9527 NewAtomicAddr->setName(X->getName() + "x.new.val");
9528 Builder.SetInsertPoint(ContBB);
9529 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9530 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9531 Value *OldExprVal = PHI;
9532 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9533 if (!CBResult)
9534 return CBResult.takeError();
9535 Value *Upd = *CBResult;
9536 Builder.CreateStore(Upd, NewAtomicAddr);
9539 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9540 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9541 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9542 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9543 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9544 OldVal->eraseFromParent();
9545 Res.first = OldExprVal;
9546 Res.second = Upd;
9547
9548 if (UnreachableInst *ExitTI =
9549 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9550 CurBBTI->eraseFromParent();
9551 Builder.SetInsertPoint(ExitBB);
9552 } else {
9553 Builder.SetInsertPoint(ExitTI);
9554 }
9555 } else {
9556 IntegerType *IntCastTy =
9558 LoadInst *OldVal =
9559 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9560 OldVal->setAtomic(AO);
9561 // CurBB
9562 // | /---\
9563 // ContBB |
9564 // | \---/
9565 // ExitBB
9567 Instruction *CurBBTI = CurBB->getTerminator();
9568 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9569 BasicBlock *ExitBB =
9570 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9571 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9572 X->getName() + ".atomic.cont");
9573 ContBB->getTerminator()->eraseFromParent();
9574 Builder.restoreIP(AllocaIP);
9575 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9576 NewAtomicAddr->setName(X->getName() + "x.new.val");
9577 Builder.SetInsertPoint(ContBB);
9578 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9579 PHI->addIncoming(OldVal, CurBB);
9580 bool IsIntTy = XElemTy->isIntegerTy();
9581 Value *OldExprVal = PHI;
9582 if (!IsIntTy) {
9583 if (XElemTy->isFloatingPointTy()) {
9584 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9585 X->getName() + ".atomic.fltCast");
9586 } else {
9587 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9588 X->getName() + ".atomic.ptrCast");
9589 }
9590 }
9591
9592 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9593 if (!CBResult)
9594 return CBResult.takeError();
9595 Value *Upd = *CBResult;
9596 Builder.CreateStore(Upd, NewAtomicAddr);
9597 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9601 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9602 Result->setVolatile(VolatileX);
9603 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9604 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9605 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9606 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9607
9608 Res.first = OldExprVal;
9609 Res.second = Upd;
9610
9611 // set Insertion point in exit block
9612 if (UnreachableInst *ExitTI =
9613 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9614 CurBBTI->eraseFromParent();
9615 Builder.SetInsertPoint(ExitBB);
9616 } else {
9617 Builder.SetInsertPoint(ExitTI);
9618 }
9619 }
9620
9621 return Res;
9622}
9623
9625 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9626 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9628 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9629 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9630 if (!updateToLocation(Loc))
9631 return Loc.IP;
9632
9633 LLVM_DEBUG({
9634 Type *XTy = X.Var->getType();
9635 assert(XTy->isPointerTy() &&
9636 "OMP Atomic expects a pointer to target memory");
9637 Type *XElemTy = X.ElemTy;
9638 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9639 XElemTy->isPointerTy()) &&
9640 "OMP atomic capture expected a scalar type");
9641 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9642 "OpenMP atomic does not support LT or GT operations");
9643 });
9644
9645 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9646 // 'x' is simply atomically rewritten with 'expr'.
9647 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9648 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9649 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9650 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9651 if (!AtomicResult)
9652 return AtomicResult.takeError();
9653 Value *CapturedVal =
9654 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9655 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9656
9657 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9658 return Builder.saveIP();
9659}
9660
9664 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9665 bool IsFailOnly) {
9666
9668 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9669 IsPostfixUpdate, IsFailOnly, Failure);
9670}
9671
9675 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9676 bool IsFailOnly, AtomicOrdering Failure) {
9677
9678 if (!updateToLocation(Loc))
9679 return Loc.IP;
9680
9681 assert(X.Var->getType()->isPointerTy() &&
9682 "OMP atomic expects a pointer to target memory");
9683 // compare capture
9684 if (V.Var) {
9685 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9686 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9687 }
9688
9689 bool IsInteger = E->getType()->isIntegerTy();
9690
9691 if (Op == OMPAtomicCompareOp::EQ) {
9692 AtomicCmpXchgInst *Result = nullptr;
9693 if (!IsInteger) {
9694 IntegerType *IntCastTy =
9695 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9696 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9697 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9698 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9699 AO, Failure);
9700 } else {
9701 Result =
9702 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9703 }
9704
9705 if (V.Var) {
9706 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9707 if (!IsInteger)
9708 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9709 assert(OldValue->getType() == V.ElemTy &&
9710 "OldValue and V must be of same type");
9711 if (IsPostfixUpdate) {
9712 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9713 } else {
9714 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9715 if (IsFailOnly) {
9716 // CurBB----
9717 // | |
9718 // v |
9719 // ContBB |
9720 // | |
9721 // v |
9722 // ExitBB <-
9723 //
9724 // where ContBB only contains the store of old value to 'v'.
9726 Instruction *CurBBTI = CurBB->getTerminator();
9727 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9728 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9729 CurBBTI, X.Var->getName() + ".atomic.exit");
9730 BasicBlock *ContBB = CurBB->splitBasicBlock(
9731 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9732 ContBB->getTerminator()->eraseFromParent();
9733 CurBB->getTerminator()->eraseFromParent();
9734
9735 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9736
9737 Builder.SetInsertPoint(ContBB);
9738 Builder.CreateStore(OldValue, V.Var);
9739 Builder.CreateBr(ExitBB);
9740
9741 if (UnreachableInst *ExitTI =
9742 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9743 CurBBTI->eraseFromParent();
9744 Builder.SetInsertPoint(ExitBB);
9745 } else {
9746 Builder.SetInsertPoint(ExitTI);
9747 }
9748 } else {
9749 Value *CapturedValue =
9750 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9751 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9752 }
9753 }
9754 }
9755 // The comparison result has to be stored.
9756 if (R.Var) {
9757 assert(R.Var->getType()->isPointerTy() &&
9758 "r.var must be of pointer type");
9759 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9760
9761 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9762 Value *ResultCast = R.IsSigned
9763 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9764 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9765 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9766 }
9767 } else {
9768 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9769 "Op should be either max or min at this point");
9770 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9771
9772 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9773 // Let's take max as example.
9774 // OpenMP form:
9775 // x = x > expr ? expr : x;
9776 // LLVM form:
9777 // *ptr = *ptr > val ? *ptr : val;
9778 // We need to transform to LLVM form.
9779 // x = x <= expr ? x : expr;
9781 if (IsXBinopExpr) {
9782 if (IsInteger) {
9783 if (X.IsSigned)
9784 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9786 else
9787 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9789 } else {
9790 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9792 }
9793 } else {
9794 if (IsInteger) {
9795 if (X.IsSigned)
9796 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9798 else
9799 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9801 } else {
9802 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9804 }
9805 }
9806
9807 AtomicRMWInst *OldValue =
9808 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9809 if (V.Var) {
9810 Value *CapturedValue = nullptr;
9811 if (IsPostfixUpdate) {
9812 CapturedValue = OldValue;
9813 } else {
9814 CmpInst::Predicate Pred;
9815 switch (NewOp) {
9816 case AtomicRMWInst::Max:
9817 Pred = CmpInst::ICMP_SGT;
9818 break;
9820 Pred = CmpInst::ICMP_UGT;
9821 break;
9823 Pred = CmpInst::FCMP_OGT;
9824 break;
9825 case AtomicRMWInst::Min:
9826 Pred = CmpInst::ICMP_SLT;
9827 break;
9829 Pred = CmpInst::ICMP_ULT;
9830 break;
9832 Pred = CmpInst::FCMP_OLT;
9833 break;
9834 default:
9835 llvm_unreachable("unexpected comparison op");
9836 }
9837 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9838 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9839 }
9840 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9841 }
9842 }
9843
9844 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9845
9846 return Builder.saveIP();
9847}
9848
9851 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9852 Value *NumTeamsUpper, Value *ThreadLimit,
9853 Value *IfExpr) {
9854 if (!updateToLocation(Loc))
9855 return InsertPointTy();
9856
9857 uint32_t SrcLocStrSize;
9858 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9859 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9860 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9861
9862 // Outer allocation basicblock is the entry block of the current function.
9863 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9864 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9865 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9866 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9867 }
9868
9869 // The current basic block is split into four basic blocks. After outlining,
9870 // they will be mapped as follows:
9871 // ```
9872 // def current_fn() {
9873 // current_basic_block:
9874 // br label %teams.exit
9875 // teams.exit:
9876 // ; instructions after teams
9877 // }
9878 //
9879 // def outlined_fn() {
9880 // teams.alloca:
9881 // br label %teams.body
9882 // teams.body:
9883 // ; instructions within teams body
9884 // }
9885 // ```
9886 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9887 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9888 BasicBlock *AllocaBB =
9889 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9890
9891 bool SubClausesPresent =
9892 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9893 // Push num_teams
9894 if (!Config.isTargetDevice() && SubClausesPresent) {
9895 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9896 "if lowerbound is non-null, then upperbound must also be non-null "
9897 "for bounds on num_teams");
9898
9899 if (NumTeamsUpper == nullptr)
9900 NumTeamsUpper = Builder.getInt32(0);
9901
9902 if (NumTeamsLower == nullptr)
9903 NumTeamsLower = NumTeamsUpper;
9904
9905 if (IfExpr) {
9906 assert(IfExpr->getType()->isIntegerTy() &&
9907 "argument to if clause must be an integer value");
9908
9909 // upper = ifexpr ? upper : 1
9910 if (IfExpr->getType() != Int1)
9911 IfExpr = Builder.CreateICmpNE(IfExpr,
9912 ConstantInt::get(IfExpr->getType(), 0));
9913 NumTeamsUpper = Builder.CreateSelect(
9914 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9915
9916 // lower = ifexpr ? lower : 1
9917 NumTeamsLower = Builder.CreateSelect(
9918 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9919 }
9920
9921 if (ThreadLimit == nullptr)
9922 ThreadLimit = Builder.getInt32(0);
9923
9924 Value *ThreadNum = getOrCreateThreadID(Ident);
9926 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9927 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9928 }
9929 // Generate the body of teams.
9930 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9931 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9932 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9933 return Err;
9934
9935 OutlineInfo OI;
9936 OI.EntryBB = AllocaBB;
9937 OI.ExitBB = ExitBB;
9938 OI.OuterAllocaBB = &OuterAllocaBB;
9939
9940 // Insert fake values for global tid and bound tid.
9942 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9944 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9946 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9947
9948 auto HostPostOutlineCB = [this, Ident,
9949 ToBeDeleted](Function &OutlinedFn) mutable {
9950 // The stale call instruction will be replaced with a new call instruction
9951 // for runtime call with the outlined function.
9952
9953 assert(OutlinedFn.hasOneUse() &&
9954 "there must be a single user for the outlined function");
9955 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9956 ToBeDeleted.push_back(StaleCI);
9957
9958 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9959 "Outlined function must have two or three arguments only");
9960
9961 bool HasShared = OutlinedFn.arg_size() == 3;
9962
9963 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9964 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9965 if (HasShared)
9966 OutlinedFn.getArg(2)->setName("data");
9967
9968 // Call to the runtime function for teams in the current function.
9969 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9970 "outlined function.");
9971 Builder.SetInsertPoint(StaleCI);
9972 SmallVector<Value *> Args = {
9973 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9974 if (HasShared)
9975 Args.push_back(StaleCI->getArgOperand(2));
9977 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9978 Args);
9979
9980 for (Instruction *I : llvm::reverse(ToBeDeleted))
9981 I->eraseFromParent();
9982 };
9983
9984 if (!Config.isTargetDevice())
9985 OI.PostOutlineCB = HostPostOutlineCB;
9986
9987 addOutlineInfo(std::move(OI));
9988
9989 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9990
9991 return Builder.saveIP();
9992}
9993
9996 InsertPointTy OuterAllocaIP,
9997 BodyGenCallbackTy BodyGenCB) {
9998 if (!updateToLocation(Loc))
9999 return InsertPointTy();
10000
10001 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10002
10003 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10004 BasicBlock *BodyBB =
10005 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10006 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10007 }
10008 BasicBlock *ExitBB =
10009 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10010 BasicBlock *BodyBB =
10011 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10012 BasicBlock *AllocaBB =
10013 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10014
10015 // Generate the body of distribute clause
10016 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10017 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10018 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10019 return Err;
10020
10021 OutlineInfo OI;
10022 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10023 OI.EntryBB = AllocaBB;
10024 OI.ExitBB = ExitBB;
10025
10026 addOutlineInfo(std::move(OI));
10027 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10028
10029 return Builder.saveIP();
10030}
10031
10034 std::string VarName) {
10035 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10037 Names.size()),
10038 Names);
10039 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10040 M, MapNamesArrayInit->getType(),
10041 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10042 VarName);
10043 return MapNamesArrayGlobal;
10044}
10045
10046// Create all simple and struct types exposed by the runtime and remember
10047// the llvm::PointerTypes of them for easy access later.
10048void OpenMPIRBuilder::initializeTypes(Module &M) {
10049 LLVMContext &Ctx = M.getContext();
10050 StructType *T;
10051#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10052#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10053 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10054 VarName##PtrTy = PointerType::getUnqual(Ctx);
10055#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10056 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10057 VarName##Ptr = PointerType::getUnqual(Ctx);
10058#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10059 T = StructType::getTypeByName(Ctx, StructName); \
10060 if (!T) \
10061 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10062 VarName = T; \
10063 VarName##Ptr = PointerType::getUnqual(Ctx);
10064#include "llvm/Frontend/OpenMP/OMPKinds.def"
10065}
10066
10069 SmallVectorImpl<BasicBlock *> &BlockVector) {
10072 BlockSet.insert(ExitBB);
10073
10074 Worklist.push_back(EntryBB);
10075 while (!Worklist.empty()) {
10076 BasicBlock *BB = Worklist.pop_back_val();
10077 BlockVector.push_back(BB);
10078 for (BasicBlock *SuccBB : successors(BB))
10079 if (BlockSet.insert(SuccBB).second)
10080 Worklist.push_back(SuccBB);
10081 }
10082}
10083
10085 uint64_t Size, int32_t Flags,
10087 StringRef Name) {
10088 if (!Config.isGPU()) {
10091 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10092 return;
10093 }
10094 // TODO: Add support for global variables on the device after declare target
10095 // support.
10096 Function *Fn = dyn_cast<Function>(Addr);
10097 if (!Fn)
10098 return;
10099
10100 // Add a function attribute for the kernel.
10101 Fn->addFnAttr("kernel");
10102 if (T.isAMDGCN())
10103 Fn->addFnAttr("uniform-work-group-size", "true");
10104 Fn->addFnAttr(Attribute::MustProgress);
10105}
10106
10107// We only generate metadata for function that contain target regions.
// NOTE(review): this is a scraped doxygen rendering — each line carries the
// file's own number, and jumps in that numbering (e.g. 10108-10109, 10112,
// 10116-10117, 10202, 10204-10207, 10210-10212, 10263-10266) show that the
// scrape dropped source lines. The missing text (function signature,
// OffloadEntriesInfoManager enum case labels, the final .requires entry
// emission, ...) must be restored from the upstream file before this block
// can compile; the lines below are kept verbatim.
10110
10111 // If there are no entries, we don't need to do anything.
10113 return;
10114
10118 16>
10119 OrderedEntries(OffloadInfoManager.size());
10120
10121 // Auxiliary methods to create metadata values and strings.
10122 auto &&GetMDInt = [this](unsigned V) {
10123 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10124 };
10125
10126 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10127
10128 // Create the offloading info metadata node.
10129 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10130 auto &&TargetRegionMetadataEmitter =
10131 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10132 const TargetRegionEntryInfo &EntryInfo,
10134 // Generate metadata for target regions. Each entry of this metadata
10135 // contains:
10136 // - Entry 0 -> Kind of this type of metadata (0).
10137 // - Entry 1 -> Device ID of the file where the entry was identified.
10138 // - Entry 2 -> File ID of the file where the entry was identified.
10139 // - Entry 3 -> Mangled name of the function where the entry was
10140 // identified.
10141 // - Entry 4 -> Line in the file where the entry was identified.
10142 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10143 // - Entry 6 -> Order the entry was created.
10144 // The first element of the metadata node is the kind.
10145 Metadata *Ops[] = {
10146 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10147 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10148 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10149 GetMDInt(E.getOrder())};
10150
10151 // Save this entry in the right position of the ordered entries array.
10152 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10153
10154 // Add metadata to the named metadata node.
10155 MD->addOperand(MDNode::get(C, Ops));
10156 };
10157
10158 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10159
10160 // Create function that emits metadata for each device global variable entry;
10161 auto &&DeviceGlobalVarMetadataEmitter =
10162 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10163 StringRef MangledName,
10165 // Generate metadata for global variables. Each entry of this metadata
10166 // contains:
10167 // - Entry 0 -> Kind of this type of metadata (1).
10168 // - Entry 1 -> Mangled name of the variable.
10169 // - Entry 2 -> Declare target kind.
10170 // - Entry 3 -> Order the entry was created.
10171 // The first element of the metadata node is the kind.
10172 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10173 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10174
10175 // Save this entry in the right position of the ordered entries array.
10176 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10177 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10178
10179 // Add metadata to the named metadata node.
10180 MD->addOperand(MDNode::get(C, Ops));
10181 };
10182
10184 DeviceGlobalVarMetadataEmitter);
10185
// Walk the ordered entries: emit an offload entry for each target region and
// each declare-target global variable, reporting errors via ErrorFn when an
// entry lacks an ID/address.
10186 for (const auto &E : OrderedEntries) {
10187 assert(E.first && "All ordered entries must exist!");
10188 if (const auto *CE =
10189 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10190 E.first)) {
10191 if (!CE->getID() || !CE->getAddress()) {
10192 // Do not blame the entry if the parent funtion is not emitted.
10193 TargetRegionEntryInfo EntryInfo = E.second;
10194 StringRef FnName = EntryInfo.ParentName;
10195 if (!M.getNamedValue(FnName))
10196 continue;
10197 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10198 continue;
10199 }
10200 createOffloadEntry(CE->getID(), CE->getAddress(),
10201 /*Size=*/0, CE->getFlags(),
10203 } else if (const auto *CE = dyn_cast<
10205 E.first)) {
10208 CE->getFlags());
// NOTE(review): the case labels of this switch (declare-target
// enter/to/link kinds) were dropped by the scrape — numbering jumps
// 10209 -> 10213 and 10221 -> 10223.
10209 switch (Flags) {
10213 continue;
10214 if (!CE->getAddress()) {
10215 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10216 continue;
10217 }
10218 // The vaiable has no definition - no need to add the entry.
10219 if (CE->getVarSize() == 0)
10220 continue;
10221 break;
10223 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10224 (!Config.isTargetDevice() && CE->getAddress())) &&
10225 "Declaret target link address is set.");
10226 if (Config.isTargetDevice())
10227 continue;
10228 if (!CE->getAddress()) {
10230 continue;
10231 }
10232 break;
10233 default:
10234 break;
10235 }
10236
10237 // Hidden or internal symbols on the device are not externally visible.
10238 // We should not attempt to register them by creating an offloading
10239 // entry. Indirect variables are handled separately on the device.
10240 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10241 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10243 continue;
10244
10245 // Indirect globals need to use a special name that doesn't match the name
10246 // of the associated host global.
10248 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10249 Flags, CE->getLinkage(), CE->getVarName())
10250 else
10251 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10252 Flags, CE->getLinkage());
10253
10254 } else {
10255 llvm_unreachable("Unsupported entry kind.");
10256 }
10257 }
10258
10259 // Emit requires directive globals to a special entry so the runtime can
10260 // register them when the device image is loaded.
10261 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10262 // entries should be redesigned to better suit this use-case.
10267 ".requires", /*Size=*/0,
10270}
10271
10273 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10274 unsigned FileID, unsigned Line, unsigned Count) {
10276 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10277 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10278 if (Count)
10279 OS << "_" << Count;
10280}
10281
10284 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10286 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10287 EntryInfo.Line, NewCount);
10288}
10289
10292 StringRef ParentName) {
10293 sys::fs::UniqueID ID(0xdeadf17e, 0);
10294 auto FileIDInfo = CallBack();
10295 uint64_t FileID = 0;
10296 std::error_code EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID);
10297 // If the inode ID could not be determined, create a hash value
10298 // the current file name and use that as an ID.
10299 if (EC)
10300 FileID = hash_value(std::get<0>(FileIDInfo));
10301 else
10302 FileID = ID.getFile();
10303
10304 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10305 std::get<1>(FileIDInfo));
10306}
10307
10309 unsigned Offset = 0;
10310 for (uint64_t Remain =
10311 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10313 !(Remain & 1); Remain = Remain >> 1)
10314 Offset++;
10315 return Offset;
10316}
10317
10320 // Rotate by getFlagMemberOffset() bits.
10321 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10322 << getFlagMemberOffset());
10323}
10324
10327 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10328 // If the entry is PTR_AND_OBJ but has not been marked with the special
10329 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10330 // marked as MEMBER_OF.
10331 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10333 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10336 return;
10337
10338 // Reset the placeholder value to prepare the flag for the assignment of the
10339 // proper MEMBER_OF value.
10340 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10341 Flags |= MemberOfFlag;
10342}
10343
// getAddrOfDeclareTargetVar (scraped rendering): for a declare-target
// variable captured by link (or by to/enter under unified shared memory —
// TODO confirm; the condition at file lines 10358-10362 was dropped by the
// scrape), creates or reuses a weak "<name>[_%x]_decl_tgt_ref_ptr" indirection
// global, initializes it on the host, registers it (call at dropped line
// 10388), and returns it. Returns nullptr for OpenMP-SIMD mode or when the
// capture/device clauses do not require an indirection.
// NOTE(review): the function signature head (file lines 10344-10346) is also
// missing from this rendering; lines below are kept verbatim.
10347 bool IsDeclaration, bool IsExternallyVisible,
10348 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10349 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10350 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10351 std::function<Constant *()> GlobalInitializer,
10352 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10353 // TODO: convert this to utilise the IRBuilder Config rather than
10354 // a passed down argument.
10355 if (OpenMPSIMD)
10356 return nullptr;
10357
10360 CaptureClause ==
10363 SmallString<64> PtrName;
10364 {
10365 raw_svector_ostream OS(PtrName);
10366 OS << MangledName;
10367 if (!IsExternallyVisible)
10368 OS << format("_%x", EntryInfo.FileID);
10369 OS << "_decl_tgt_ref_ptr";
10370 }
10371
10372 Value *Ptr = M.getNamedValue(PtrName);
10373
10374 if (!Ptr) {
10375 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10376 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10377
10378 auto *GV = cast<GlobalVariable>(Ptr);
10379 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10380
10381 if (!Config.isTargetDevice()) {
10382 if (GlobalInitializer)
10383 GV->setInitializer(GlobalInitializer());
10384 else
10385 GV->setInitializer(GlobalValue);
10386 }
10387
10389 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10390 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10391 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10392 }
10393
10394 return cast<Constant>(Ptr);
10395 }
10396
10397 return nullptr;
10398}
10399
// registerTargetGlobalVariable (scraped rendering): registers a
// declare-target global variable with the OffloadInfoManager (call at the
// dropped trailing lines 10475-10476 / visible 10476), computing its name,
// size, linkage and flags depending on the capture clause (to/enter vs. link
// — the branch condition lines 10419-10423 were dropped by the scrape) and on
// whether we compile for the target device.
// NOTE(review): the signature head (file lines 10400-10402) and several
// condition/assignment lines (10410, 10414, 10417, 10429, 10440, 10456-10457,
// 10459, 10465, 10472, 10475) are missing from this rendering; the lines
// below are kept verbatim.
10403 bool IsDeclaration, bool IsExternallyVisible,
10404 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10405 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10406 std::vector<Triple> TargetTriple,
10407 std::function<Constant *()> GlobalInitializer,
10408 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10409 Constant *Addr) {
10411 (TargetTriple.empty() && !Config.isTargetDevice()))
10412 return;
10413
10415 StringRef VarName;
10416 int64_t VarSize;
10418
10420 CaptureClause ==
10424 VarName = MangledName;
10425 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10426
10427 if (!IsDeclaration)
10428 VarSize = divideCeil(
10430 else
10431 VarSize = 0;
10432 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10433
10434 // This is a workaround carried over from Clang which prevents undesired
10435 // optimisation of internal variables.
10436 if (Config.isTargetDevice() &&
10437 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10438 // Do not create a "ref-variable" if the original is not also available
10439 // on the host.
10441 return;
10442
10443 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10444
10445 if (!M.getNamedValue(RefName)) {
10446 Constant *AddrRef =
10447 getOrCreateInternalVariable(Addr->getType(), RefName);
10448 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10449 GvAddrRef->setConstant(true);
10450 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10451 GvAddrRef->setInitializer(Addr);
10452 GeneratedRefs.push_back(GvAddrRef);
10453 }
10454 }
10455 } else {
10458 else
10460
10461 if (Config.isTargetDevice()) {
10462 VarName = (Addr) ? Addr->getName() : "";
10463 Addr = nullptr;
10464 } else {
10466 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10467 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10468 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10469 VarName = (Addr) ? Addr->getName() : "";
10470 }
10471 VarSize = M.getDataLayout().getPointerSize();
10473 }
10474
10476 Flags, Linkage);
10477}
10478
10479/// Loads all the offload entries information from the host IR
10480/// metadata.
10482 // If we are in target mode, load the metadata from the host IR. This code has
10483 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10484
10486 if (!MD)
10487 return;
10488
10489 for (MDNode *MN : MD->operands()) {
10490 auto &&GetMDInt = [MN](unsigned Idx) {
10491 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10492 return cast<ConstantInt>(V->getValue())->getZExtValue();
10493 };
10494
10495 auto &&GetMDString = [MN](unsigned Idx) {
10496 auto *V = cast<MDString>(MN->getOperand(Idx));
10497 return V->getString();
10498 };
10499
10500 switch (GetMDInt(0)) {
10501 default:
10502 llvm_unreachable("Unexpected metadata!");
10503 break;
10506 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10507 /*DeviceID=*/GetMDInt(1),
10508 /*FileID=*/GetMDInt(2),
10509 /*Line=*/GetMDInt(4),
10510 /*Count=*/GetMDInt(5));
10512 /*Order=*/GetMDInt(6));
10513 break;
10514 }
10518 /*MangledName=*/GetMDString(1),
10520 /*Flags=*/GetMDInt(2)),
10521 /*Order=*/GetMDInt(3));
10522 break;
10523 }
10524 }
10525}
10526
10528 if (HostFilePath.empty())
10529 return;
10530
10531 auto Buf = MemoryBuffer::getFile(HostFilePath);
10532 if (std::error_code Err = Buf.getError()) {
10533 report_fatal_error(("error opening host file from host file path inside of "
10534 "OpenMPIRBuilder: " +
10535 Err.message())
10536 .c_str());
10537 }
10538
10539 LLVMContext Ctx;
10541 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10542 if (std::error_code Err = M.getError()) {
10544 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10545 .c_str());
10546 }
10547
10548 loadOffloadInfoMetadata(*M.get());
10549}
10550
10551//===----------------------------------------------------------------------===//
10552// OffloadEntriesInfoManager
10553//===----------------------------------------------------------------------===//
10554
10556 return OffloadEntriesTargetRegion.empty() &&
10557 OffloadEntriesDeviceGlobalVar.empty();
10558}
10559
10560unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10561 const TargetRegionEntryInfo &EntryInfo) const {
10562 auto It = OffloadEntriesTargetRegionCount.find(
10563 getTargetRegionEntryCountKey(EntryInfo));
10564 if (It == OffloadEntriesTargetRegionCount.end())
10565 return 0;
10566 return It->second;
10567}
10568
10569void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10570 const TargetRegionEntryInfo &EntryInfo) {
10571 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10572 EntryInfo.Count + 1;
10573}
10574
10575/// Initialize target region entry.
10577 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10578 OffloadEntriesTargetRegion[EntryInfo] =
10579 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10580 OMPTargetRegionEntryTargetRegion);
10581 ++OffloadingEntriesNum;
10582}
10583
10587 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10588
10589 // Update the EntryInfo with the next available count for this location.
10590 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10591
10592 // If we are emitting code for a target, the entry is already initialized,
10593 // only has to be registered.
10594 if (OMPBuilder->Config.isTargetDevice()) {
10595 // This could happen if the device compilation is invoked standalone.
10596 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10597 return;
10598 }
10599 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10600 Entry.setAddress(Addr);
10601 Entry.setID(ID);
10602 Entry.setFlags(Flags);
10603 } else {
10605 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10606 return;
10607 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10608 "Target region entry already registered!");
10609 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10610 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10611 ++OffloadingEntriesNum;
10612 }
10613 incrementTargetRegionEntryInfoCount(EntryInfo);
10614}
10615
10617 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10618
10619 // Update the EntryInfo with the next available count for this location.
10620 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10621
10622 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10623 if (It == OffloadEntriesTargetRegion.end()) {
10624 return false;
10625 }
10626 // Fail if this entry is already registered.
10627 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10628 return false;
10629 return true;
10630}
10631
10633 const OffloadTargetRegionEntryInfoActTy &Action) {
10634 // Scan all target region entries and perform the provided action.
10635 for (const auto &It : OffloadEntriesTargetRegion) {
10636 Action(It.first, It.second);
10637 }
10638}
10639
10641 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10642 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10643 ++OffloadingEntriesNum;
10644}
10645
10647 StringRef VarName, Constant *Addr, int64_t VarSize,
10649 if (OMPBuilder->Config.isTargetDevice()) {
10650 // This could happen if the device compilation is invoked standalone.
10651 if (!hasDeviceGlobalVarEntryInfo(VarName))
10652 return;
10653 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10654 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10655 if (Entry.getVarSize() == 0) {
10656 Entry.setVarSize(VarSize);
10657 Entry.setLinkage(Linkage);
10658 }
10659 return;
10660 }
10661 Entry.setVarSize(VarSize);
10662 Entry.setLinkage(Linkage);
10663 Entry.setAddress(Addr);
10664 } else {
10665 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10666 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10667 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10668 "Entry not initialized!");
10669 if (Entry.getVarSize() == 0) {
10670 Entry.setVarSize(VarSize);
10671 Entry.setLinkage(Linkage);
10672 }
10673 return;
10674 }
10676 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10677 Addr, VarSize, Flags, Linkage,
10678 VarName.str());
10679 else
10680 OffloadEntriesDeviceGlobalVar.try_emplace(
10681 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10682 ++OffloadingEntriesNum;
10683 }
10684}
10685
10688 // Scan all target region entries and perform the provided action.
10689 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10690 Action(E.getKey(), E.getValue());
10691}
10692
10693//===----------------------------------------------------------------------===//
10694// CanonicalLoopInfo
10695//===----------------------------------------------------------------------===//
10696
10697void CanonicalLoopInfo::collectControlBlocks(
10699 // We only count those BBs as control block for which we do not need to
10700 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10701 // flow. For consistency, this also means we do not add the Body block, which
10702 // is just the entry to the body code.
10703 BBs.reserve(BBs.size() + 6);
10704 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10705}
10706
10708 assert(isValid() && "Requires a valid canonical loop");
10709 for (BasicBlock *Pred : predecessors(Header)) {
10710 if (Pred != Latch)
10711 return Pred;
10712 }
10713 llvm_unreachable("Missing preheader");
10714}
10715
10716void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10717 assert(isValid() && "Requires a valid canonical loop");
10718
10719 Instruction *CmpI = &getCond()->front();
10720 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10721 CmpI->setOperand(1, TripCount);
10722
10723#ifndef NDEBUG
10724 assertOK();
10725#endif
10726}
10727
10728void CanonicalLoopInfo::mapIndVar(
10729 llvm::function_ref<Value *(Instruction *)> Updater) {
10730 assert(isValid() && "Requires a valid canonical loop");
10731
10732 Instruction *OldIV = getIndVar();
10733
10734 // Record all uses excluding those introduced by the updater. Uses by the
10735 // CanonicalLoopInfo itself to keep track of the number of iterations are
10736 // excluded.
10737 SmallVector<Use *> ReplacableUses;
10738 for (Use &U : OldIV->uses()) {
10739 auto *User = dyn_cast<Instruction>(U.getUser());
10740 if (!User)
10741 continue;
10742 if (User->getParent() == getCond())
10743 continue;
10744 if (User->getParent() == getLatch())
10745 continue;
10746 ReplacableUses.push_back(&U);
10747 }
10748
10749 // Run the updater that may introduce new uses
10750 Value *NewIV = Updater(OldIV);
10751
10752 // Replace the old uses with the value returned by the updater.
10753 for (Use *U : ReplacableUses)
10754 U->set(NewIV);
10755
10756#ifndef NDEBUG
10757 assertOK();
10758#endif
10759}
10760
10762#ifndef NDEBUG
10763 // No constraints if this object currently does not describe a loop.
10764 if (!isValid())
10765 return;
10766
10767 BasicBlock *Preheader = getPreheader();
10768 BasicBlock *Body = getBody();
10769 BasicBlock *After = getAfter();
10770
10771 // Verify standard control-flow we use for OpenMP loops.
10772 assert(Preheader);
10773 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10774 "Preheader must terminate with unconditional branch");
10775 assert(Preheader->getSingleSuccessor() == Header &&
10776 "Preheader must jump to header");
10777
10778 assert(Header);
10779 assert(isa<BranchInst>(Header->getTerminator()) &&
10780 "Header must terminate with unconditional branch");
10781 assert(Header->getSingleSuccessor() == Cond &&
10782 "Header must jump to exiting block");
10783
10784 assert(Cond);
10785 assert(Cond->getSinglePredecessor() == Header &&
10786 "Exiting block only reachable from header");
10787
10788 assert(isa<BranchInst>(Cond->getTerminator()) &&
10789 "Exiting block must terminate with conditional branch");
10790 assert(size(successors(Cond)) == 2 &&
10791 "Exiting block must have two successors");
10792 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10793 "Exiting block's first successor jump to the body");
10794 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10795 "Exiting block's second successor must exit the loop");
10796
10797 assert(Body);
10798 assert(Body->getSinglePredecessor() == Cond &&
10799 "Body only reachable from exiting block");
10800 assert(!isa<PHINode>(Body->front()));
10801
10802 assert(Latch);
10803 assert(isa<BranchInst>(Latch->getTerminator()) &&
10804 "Latch must terminate with unconditional branch");
10805 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10806 // TODO: To support simple redirecting of the end of the body code that has
10807 // multiple; introduce another auxiliary basic block like preheader and after.
10808 assert(Latch->getSinglePredecessor() != nullptr);
10809 assert(!isa<PHINode>(Latch->front()));
10810
10811 assert(Exit);
10812 assert(isa<BranchInst>(Exit->getTerminator()) &&
10813 "Exit block must terminate with unconditional branch");
10814 assert(Exit->getSingleSuccessor() == After &&
10815 "Exit block must jump to after block");
10816
10817 assert(After);
10818 assert(After->getSinglePredecessor() == Exit &&
10819 "After block only reachable from exit block");
10820 assert(After->empty() || !isa<PHINode>(After->front()));
10821
10822 Instruction *IndVar = getIndVar();
10823 assert(IndVar && "Canonical induction variable not found?");
10824 assert(isa<IntegerType>(IndVar->getType()) &&
10825 "Induction variable must be an integer");
10826 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10827 "Induction variable must be a PHI in the loop header");
10828 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10829 assert(
10830 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10831 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10832
10833 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10834 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10835 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10836 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10837 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10838 ->isOne());
10839
10840 Value *TripCount = getTripCount();
10841 assert(TripCount && "Loop trip count not found?");
10842 assert(IndVar->getType() == TripCount->getType() &&
10843 "Trip count and induction variable must have the same type");
10844
10845 auto *CmpI = cast<CmpInst>(&Cond->front());
10846 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10847 "Exit condition must be a signed less-than comparison");
10848 assert(CmpI->getOperand(0) == IndVar &&
10849 "Exit condition must compare the induction variable");
10850 assert(CmpI->getOperand(1) == TripCount &&
10851 "Exit condition must compare with the trip count");
10852#endif
10853}
10854
10856 Header = nullptr;
10857 Cond = nullptr;
10858 Latch = nullptr;
10859 Exit = nullptr;
10860}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:546
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn)
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:64
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:128
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:101
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:121
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:106
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:132
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:97
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:473
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition: Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
Class to represent array types.
Definition: DerivedTypes.h:398
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:657
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition: Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:721
@ Add
*p = old + v
Definition: Instructions.h:725
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:777
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
Definition: Instructions.h:765
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ Sub
*p = old - v
Definition: Instructions.h:727
@ And
*p = old & v
Definition: Instructions.h:729
@ Xor
*p = old ^ v
Definition: Instructions.h:735
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:781
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
Definition: Instructions.h:761
@ FSub
*p = old - v
Definition: Instructions.h:749
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:769
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:773
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
LLVM_ABI AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
LLVM_ABI AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
LLVM_ABI AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:615
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:944
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:929
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:400
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:646
iterator end()
Definition: BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:393
reverse_iterator rbegin()
Definition: BasicBlock.h:475
bool empty() const
Definition: BasicBlock.h:481
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:337
const Instruction & front() const
Definition: BasicBlock.h:482
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:354
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:555
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:475
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:437
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:445
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:467
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:235
reverse_iterator rend()
Definition: BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:131
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:662
const Instruction & back() const
Definition: BasicBlock.h:484
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:248
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:494
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1956
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1267
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1273
unsigned arg_size() const
Definition: InstrTypes.h:1290
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:682
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:47
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:87
LLVM_ABI void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
LLVM_ABI Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
LLVM_ABI void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
LLVM_ABI bool isEligible() const
Test whether this code extractor is eligible.
LLVM_ABI void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1314
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:535
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2989
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2246
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
Definition: Constants.cpp:2240
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2261
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
Definition: Constants.cpp:2489
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2340
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:868
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1833
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1380
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
Debug location.
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:248
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:533
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:230
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
LLVM_ABI unsigned getPointerSize(unsigned AS=0) const
The pointer representation size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:738
unsigned getIndexSizeInBits(unsigned AS) const
The size in bits of indices used for address calculation in getelementptr and for addresses in the gi...
Definition: DataLayout.h:398
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:468
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition: DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:384
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
Lightweight error class with error context and mandatory checking.
Definition: Error.h:159
static ErrorSuccess success()
Create a success value.
Definition: Error.h:336
Tagged union holding either a T or a Error.
Definition: Error.h:485
Error takeError()
Take ownership of the stored error.
Definition: Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:166
const BasicBlock & getEntryBlock() const
Definition: Function.h:807
bool empty() const
Definition: Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:774
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
const Function & getFunction() const
Definition: Function.h:164
iterator begin()
Definition: Function.h:851
arg_iterator arg_begin()
Definition: Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:753
size_t arg_size() const
Definition: Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:214
iterator end()
Definition: Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:274
Argument * getArg(unsigned i) const
Definition: Function.h:884
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1605
LinkageTypes getLinkage() const
Definition: GlobalValue.h:548
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:539
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
void setDSOLocal(bool Local)
Definition: GlobalValue.h:305
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:296
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:70
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:256
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:56
Type * getValueType() const
Definition: GlobalValue.h:298
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:291
BasicBlock * getBlock() const
Definition: IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:497
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1450
LLVM_ABI Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1027
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2345
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1898
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1936
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1830
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2625
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2353
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:595
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2100
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:687
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1339
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2251
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2618
LLVM_ABI CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1240
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2128
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:202
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:2029
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:617
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2094
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:562
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2142
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1412
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:247
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:567
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1931
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2263
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1454
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2333
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1416
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:557
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:527
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1781
LLVM_ABI CallInst * CreateMalloc(Type *IntPtrTy, Type *AllocTy, Value *AllocSize, Value *ArraySize, ArrayRef< OperandBundleDef > OpB, Function *MallocF=nullptr, const Twine &Name="")
Definition: IRBuilder.cpp:259
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:311
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
LLVM_ABI CallInst * CreateFree(Value *Source, ArrayRef< OperandBundleDef > Bundles={})
Generate the IR for a call to the builtin free function.
Definition: IRBuilder.cpp:311
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1220
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2329
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:172
LLVM_ABI DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:63
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1420
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:533
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1197
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1847
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
LLVMContext & getContext() const
Definition: IRBuilder.h:203
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1551
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1167
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1970
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:2016
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1860
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1403
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2194
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1463
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2651
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1911
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1191
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:196
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2361
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:517
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2341
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2277
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:323
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2646
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:600
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1883
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1532
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1599
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2439
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:552
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1480
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2115
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2209
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1437
LLVM_ABI GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:43
Value * CreateNUWSub(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1433
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:90
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:78
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:428
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1718
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:510
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
Value * getPointerOperand()
Definition: Instructions.h:259
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:245
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:215
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:570
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:981
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:119
Metadata node.
Definition: Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1078
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1573
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1443
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1565
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:607
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:56
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:281
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:295
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:285
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:229
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:269
iterator_range< global_iterator > globals()
Definition: Module.h:684
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:596
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:430
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:171
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:302
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:445
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:278
A tuple of MDNodes.
Definition: Metadata.h:1753
iterator_range< op_iterator > operands()
Definition: Metadata.h:1849
LLVM_ABI void addOperand(MDNode *M)
Definition: Metadata.cpp:1471
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:255
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:257
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:390
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:392
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:308
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:310
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:299
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry..
Definition: OMPIRBuilder.h:370
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:376
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:382
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:380
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
Definition: OMPIRBuilder.h:374
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
Definition: OMPIRBuilder.h:372
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:444
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:104
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:200
StringRef separator() const
Definition: OMPIRBuilder.h:186
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:176
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:117
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:159
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
LLVM_ABI bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:153
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:196
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:485
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the RedctionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:560
LLVM_ABI void emitBranch(BasicBlock *Target)
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attribtues of the function Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Get the create a name using the platform specific separators.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false)
Generator for '#omp target'.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for '#omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for '#omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive split and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:536
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:539
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:700
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition: SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:247
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:99
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
Definition: SmallPtrSet.h:435
iterator end() const
Definition: SmallPtrSet.h:499
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:806
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
void setAlignment(Align Align)
Definition: Instructions.h:342
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:369
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:257
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:710
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:461
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:281
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:626
Class to represent struct types.
Definition: DerivedTypes.h:218
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:620
Type * getElementType(unsigned N) const
Definition: DerivedTypes.h:369
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:1037
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1099
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:408
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1109
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:261
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
LLVM_ABI Type * getStructElementType(unsigned N) const
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:132
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:148
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
void setOperand(unsigned i, Value *Val)
Definition: User.h:237
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
User * user_back()
Definition: Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:953
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition: Value.cpp:188
bool use_empty() const
Definition: Value.h:346
user_iterator user_end()
Definition: Value.h:410
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:692
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ Exit
Definition: COFF.h:863
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
Definition: CallingConv.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
LLVM_ABI void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:85
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:198
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:255
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:286
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:270
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
LLVM_ABI std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:870
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:68
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:1128
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:126
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:22
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
a struct to pack relevant information while generating atomic Ops
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:662
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:214
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61