//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is inserted.
/// For instance, if both point to the same instruction, two IRBuilders
/// alternately creating instructions will cause them to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic block
/// if the insert point points to the end of the block.
static void restoreIPandDebugLoc(IRBuilderBase &Builder,
                                 IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}

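// Worked example (illustrative): for '#pragma omp for schedule(dynamic, 4)'
// with no modifiers and no ordered clause, the computation above yields
//   BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic
// per the OpenMP 5.1 defaulting rules, whereas 'schedule(static) ordered'
// yields OrderedStatic with no monotonicity bits set, since monotonic is
// already the runtime's default behavior for static and ordered schedules.
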
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

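// Illustrative effect of splitBB at an insert point inside %bb (block and
// value names hypothetical):
//
//   before:  bb:        %a = ...            ; insert point between %a and %b
//                       %b = ...
//
//   after:   bb:        %a = ...
//                       br label %bb.split  ; only if CreateBranch is true
//            bb.split:  %b = ...
//
// and the builder keeps appending to %bb (before its branch, if any).
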
// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

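// Illustrative IR modeled by createFakeIntVal with AsPtr=true (names
// hypothetical): the outer alloca scope gets "%x.addr = alloca i32" and the
// inner scope gets "%x.use = load i32, ptr %x.addr", so the outliner sees
// %x.addr as a captured pointer argument; everything in ToBeDeleted is erased
// again once the real runtime arguments have been wired up.
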
//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

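// For example, a translation unit containing
//   #pragma omp requires reverse_offload unified_shared_memory
// ends up with RequiresFlags == 0x002 | 0x008 == 0x00A.
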
OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

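// The teams/threads counts are widened to three dimensions here. For example,
// a single-dimension NumTeams of {8} becomes the [3 x i32] value {8, 0, 0},
// matching the layout the __tgt_target_kernel entry point expects in its
// kernel-arguments struct.
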
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }
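
    // The annotation above corresponds to the following textual IR
    // (illustrative):
    //   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
    //   !0 = !{!1}
    //   !1 = !{i64 2, i64 -1, i64 -1, i1 true}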

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target, or we risk malformed optimisations by later passes; this is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here,
  // it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name,
                               std::vector<WeakTrackingVH> &List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

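// For instance, emitUsed("llvm.compiler.used", {@Storage}) produces
// (illustrative):
//   @llvm.compiler.used = appending global [1 x ptr] [ptr @Storage],
//                         section "llvm.metadata"
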
GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

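// A typical ident_t global emitted by the code above looks like this
// (illustrative; flag value 2 is OMP_IDENT_FLAG_KMPC):
//   @0 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 <strlen>, ptr @.str }, align 8
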
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}

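// The buffer built above follows libomp's ";file;function;line;column;;"
// convention; e.g. for function 'foo' at line 3, column 7 of file 'bar.c' it
// yields the string ";bar.c;foo;3;7;;".
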
Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}

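// In the simple (non-cancellable) case the emitted IR boils down to
// (illustrative):
//   %tid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   call void @__kmpc_barrier(ptr @loc.flags, i32 %tid)
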
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

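// The launch-and-fallback skeleton generated above looks roughly like this
// (illustrative):
//   %rc = call i32 @__tgt_target_kernel(ptr @loc, i64 %dev, ..., ptr %args)
//   %failed = icmp ne i32 %rc, 0
//   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
// where %omp_offload.failed runs the host fallback before branching to
// %omp_offload.cont.
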
Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support
// the omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support
// the omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

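// On the host, the rewritten call site ends up looking like this
// (illustrative, for two captured pointers %a and %b):
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//            ptr @loc, i32 2, ptr @foo..omp_par, ptr %a, ptr %b)
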
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

1620 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1621
1622 // Let the caller create the body.
1623 assert(BodyGenCB && "Expected body generation callback!");
1624 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1625 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1626 return Err;
1627
1628 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1629
1630 OutlineInfo OI;
1631 if (Config.isTargetDevice()) {
1632 // Generate OpenMP target specific runtime call
1633 OI.PostOutlineCB = [=, ToBeDeletedVec =
1634 std::move(ToBeDeleted)](Function &OutlinedFn) {
1635 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1636 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1637 ThreadID, ToBeDeletedVec);
1638 };
1639 } else {
1640 // Generate OpenMP host runtime call
1641 OI.PostOutlineCB = [=, ToBeDeletedVec =
1642 std::move(ToBeDeleted)](Function &OutlinedFn) {
1643 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1644 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1645 };
1646 }
1647
1648 OI.OuterAllocaBB = OuterAllocaBlock;
1649 OI.EntryBB = PRegEntryBB;
1650 OI.ExitBB = PRegExitBB;
1651
1652 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1653   SmallVector<BasicBlock *, 32> Blocks;
1654   OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1655
1656 CodeExtractorAnalysisCache CEAC(*OuterFn);
1657 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1658 /* AggregateArgs */ false,
1659 /* BlockFrequencyInfo */ nullptr,
1660 /* BranchProbabilityInfo */ nullptr,
1661 /* AssumptionCache */ nullptr,
1662 /* AllowVarArgs */ true,
1663 /* AllowAlloca */ true,
1664 /* AllocationBlock */ OuterAllocaBlock,
1665 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1666
1667 // Find inputs to, outputs from the code region.
1668 BasicBlock *CommonExit = nullptr;
1669 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1670 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1671
1672 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1673 /*CollectGlobalInputs=*/true);
1674
1675 Inputs.remove_if([&](Value *I) {
1676 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1677 return GV->getValueType() == OpenMPIRBuilder::Ident;
1678
1679 return false;
1680 });
1681
1682 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1683
1684 FunctionCallee TIDRTLFn =
1685 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1686
1687 auto PrivHelper = [&](Value &V) -> Error {
1688 if (&V == TIDAddr || &V == ZeroAddr) {
1689 OI.ExcludeArgsFromAggregate.push_back(&V);
1690 return Error::success();
1691 }
1692
1693     SetVector<Use *> Uses;
1694     for (Use &U : V.uses())
1695 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1696 if (ParallelRegionBlockSet.count(UserI->getParent()))
1697 Uses.insert(&U);
1698
1699 // __kmpc_fork_call expects extra arguments as pointers. If the input
1700 // already has a pointer type, everything is fine. Otherwise, store the
1701 // value onto stack and load it back inside the to-be-outlined region. This
1702 // will ensure only the pointer will be passed to the function.
1703 // FIXME: if there are more than 15 trailing arguments, they must be
1704 // additionally packed in a struct.
1705 Value *Inner = &V;
1706 if (!V.getType()->isPointerTy()) {
1707       IRBuilder<>::InsertPointGuard Guard(Builder);
1708       LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1709
1710 Builder.restoreIP(OuterAllocaIP);
1711 Value *Ptr =
1712 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1713
1714 // Store to stack at end of the block that currently branches to the entry
1715 // block of the to-be-outlined region.
1716 Builder.SetInsertPoint(InsertBB,
1717 InsertBB->getTerminator()->getIterator());
1718 Builder.CreateStore(&V, Ptr);
1719
1720 // Load back next to allocations in the to-be-outlined region.
1721 Builder.restoreIP(InnerAllocaIP);
1722 Inner = Builder.CreateLoad(V.getType(), Ptr);
1723 }
1724
1725 Value *ReplacementValue = nullptr;
1726 CallInst *CI = dyn_cast<CallInst>(&V);
1727 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1728 ReplacementValue = PrivTID;
1729 } else {
1730 InsertPointOrErrorTy AfterIP =
1731 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1732 if (!AfterIP)
1733 return AfterIP.takeError();
1734 Builder.restoreIP(*AfterIP);
1735 InnerAllocaIP = {
1736 InnerAllocaIP.getBlock(),
1737 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1738
1739 assert(ReplacementValue &&
1740 "Expected copy/create callback to set replacement value!");
1741 if (ReplacementValue == &V)
1742 return Error::success();
1743 }
1744
1745 for (Use *UPtr : Uses)
1746 UPtr->set(ReplacementValue);
1747
1748 return Error::success();
1749 };
1750
1751 // Reset the inner alloca insertion as it will be used for loading the values
1752 // wrapped into pointers before passing them into the to-be-outlined region.
1753 // Configure it to insert immediately after the fake use of zero address so
1754 // that they are available in the generated body and so that the
1755 // OpenMP-related values (thread ID and zero address pointers) remain leading
1756 // in the argument list.
1757 InnerAllocaIP = IRBuilder<>::InsertPoint(
1758 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1759
1760 // Reset the outer alloca insertion point to the entry of the relevant block
1761 // in case it was invalidated.
1762 OuterAllocaIP = IRBuilder<>::InsertPoint(
1763 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1764
1765 for (Value *Input : Inputs) {
1766 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1767 if (Error Err = PrivHelper(*Input))
1768 return Err;
1769 }
1770 LLVM_DEBUG({
1771 for (Value *Output : Outputs)
1772 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1773 });
1774 assert(Outputs.empty() &&
1775 "OpenMP outlining should not produce live-out values!");
1776
1777 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1778 LLVM_DEBUG({
1779 for (auto *BB : Blocks)
1780 dbgs() << " PBR: " << BB->getName() << "\n";
1781 });
1782
1783   // Adjust the finalization stack, verify the adjustment, and call the
1784   // finalize function one last time to finalize values between the pre-fini
1785   // block and the exit block if we left the parallel region "the normal way".
1786 auto FiniInfo = FinalizationStack.pop_back_val();
1787 (void)FiniInfo;
1788 assert(FiniInfo.DK == OMPD_parallel &&
1789 "Unexpected finalization stack state!");
1790
1791 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1792
1793 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1794 if (Error Err = FiniCB(PreFiniIP))
1795 return Err;
1796
1797 // Register the outlined info.
1798 addOutlineInfo(std::move(OI));
1799
1800 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1801 UI->eraseFromParent();
1802
1803 return AfterIP;
1804}
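//
// For reference, a rough sketch of the host-side IR that outlining a parallel
// region ultimately produces (illustrative only; the actual runtime call is
// emitted later by hostParallelCallback / targetParallelCallback):
// ```
//   ; captured values are forwarded as pointers
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr @loc, i32 2, ptr @foo..omp_par,
//                         ptr %a.reloaded, ptr %b.reloaded)
// ```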
1805
1806 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1807   // Build call void __kmpc_flush(ident_t *loc)
1808 uint32_t SrcLocStrSize;
1809 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1810 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1811
1812 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1813}
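//
// For reference, the flush above lowers to a single runtime call of the form
// (the location global's name is illustrative):
// ```
//   call void @__kmpc_flush(ptr @loc)
// ```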
1814
1815 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1816   if (!updateToLocation(Loc))
1817 return;
1818 emitFlush(Loc);
1819}
1820
1821 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1822   // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1823 // global_tid);
1824 uint32_t SrcLocStrSize;
1825 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1826 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1827 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1828
1829 // Ignore return result until untied tasks are supported.
1830 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1831 Args);
1832}
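//
// For reference, a sketch of the IR emitted for a taskwait (names
// illustrative):
// ```
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   call i32 @__kmpc_omp_taskwait(ptr @loc, i32 %gtid)
// ```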
1833
1834 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1835   if (!updateToLocation(Loc))
1836 return;
1837 emitTaskwaitImpl(Loc);
1838}
1839
1840 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1841   // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1842 uint32_t SrcLocStrSize;
1843 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1844 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1845   Constant *I32Null = ConstantInt::getNullValue(Int32);
1846   Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1847
1848 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1849 Args);
1850}
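//
// For reference, a sketch of the IR emitted for a taskyield; the trailing
// i32 0 is the I32Null constant passed above (names illustrative):
// ```
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   call i32 @__kmpc_omp_taskyield(ptr @loc, i32 %gtid, i32 0)
// ```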
1851
1852 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1853   if (!updateToLocation(Loc))
1854 return;
1855 emitTaskyieldImpl(Loc);
1856}
1857
1858 // Processes the dependencies in Dependencies and does the following:
1859 // - Allocates stack space for an array of DependInfo objects.
1860 // - Populates each DependInfo object with the relevant information about
1861 //   the corresponding dependence.
1862 // - All code is inserted in the entry block of the current function.
1863 static Value *emitTaskDependencies(
1864     OpenMPIRBuilder &OMPBuilder,
1865     const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1866   // Early return if we have no dependencies to process
1867 if (Dependencies.empty())
1868 return nullptr;
1869
1870 // Given a vector of DependData objects, in this function we create an
1871   // array on the stack that holds kmp_depend_info objects corresponding
1872   // to each dependency. This is then passed to the OpenMP runtime.
1873   // For example, if there are 'n' dependencies then the following pseudo
1874   // code is generated. Assume the first dependence is on a variable 'a'.
1875   //
1876   // \code{c}
1877   // DepArray = alloca(n x sizeof(kmp_depend_info));
1878   // idx = 0;
1879   // DepArray[idx].base_addr = ptrtoint(&a);
1880   // DepArray[idx].len = 8;
1881   // DepArray[idx].flags = Dep.DepKind; /* (See OMPConstants.h for DepKind) */
1882   // ++idx;
1883   // DepArray[idx].base_addr = ...;
1882 // ++idx;
1883 // DepArray[idx].base_addr = ...;
1884 // \endcode
1885
1886 IRBuilderBase &Builder = OMPBuilder.Builder;
1887 Type *DependInfo = OMPBuilder.DependInfo;
1888 Module &M = OMPBuilder.M;
1889
1890 Value *DepArray = nullptr;
1891 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1892 Builder.SetInsertPoint(
1893       OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1894
1895 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1896 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1897
1898 Builder.restoreIP(OldIP);
1899
1900 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1901 Value *Base =
1902 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1903 // Store the pointer to the variable
1904 Value *Addr = Builder.CreateStructGEP(
1905 DependInfo, Base,
1906 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1907 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1908 Builder.CreateStore(DepValPtr, Addr);
1909 // Store the size of the variable
1910 Value *Size = Builder.CreateStructGEP(
1911 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1912 Builder.CreateStore(
1913 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1914 Size);
1915 // Store the dependency kind
1916 Value *Flags = Builder.CreateStructGEP(
1917 DependInfo, Base,
1918 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1919 Builder.CreateStore(
1920 ConstantInt::get(Builder.getInt8Ty(),
1921 static_cast<unsigned int>(Dep.DepKind)),
1922 Flags);
1923 }
1924 return DepArray;
1925}
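//
// For reference, a sketch of what the loop above emits for a single
// dependence on an 8-byte variable %a (field order per RTLDependInfoFields;
// names illustrative):
// ```
//   %.dep.arr.addr = alloca [1 x %struct.kmp_depend_info]
//   %base = getelementptr inbounds [1 x %struct.kmp_depend_info], ptr %.dep.arr.addr, i64 0, i64 0
//   %addr = getelementptr inbounds %struct.kmp_depend_info, ptr %base, i32 0, i32 0
//   %a.int = ptrtoint ptr %a to i64
//   store i64 %a.int, ptr %addr
//   ; ...the len and flags fields are filled in the same way...
// ```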
1926
1927 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1928     const LocationDescription &Loc, InsertPointTy AllocaIP,
1929 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1930 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1931 Value *Priority) {
1932
1933 if (!updateToLocation(Loc))
1934 return InsertPointTy();
1935
1936 uint32_t SrcLocStrSize;
1937 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1938 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1939 // The current basic block is split into four basic blocks. After outlining,
1940 // they will be mapped as follows:
1941 // ```
1942 // def current_fn() {
1943 // current_basic_block:
1944 // br label %task.exit
1945 // task.exit:
1946 // ; instructions after task
1947 // }
1948 // def outlined_fn() {
1949 // task.alloca:
1950 // br label %task.body
1951 // task.body:
1952 // ret void
1953 // }
1954 // ```
1955 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1956 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1957 BasicBlock *TaskAllocaBB =
1958 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1959
1960 InsertPointTy TaskAllocaIP =
1961 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1962 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1963 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1964 return Err;
1965
1966 OutlineInfo OI;
1967 OI.EntryBB = TaskAllocaBB;
1968 OI.OuterAllocaBB = AllocaIP.getBlock();
1969 OI.ExitBB = TaskExitBB;
1970
1971 // Add the thread ID argument.
1972   SmallVector<Instruction *, 4> ToBeDeleted;
1973   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1974       Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1975
1976 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1977 Mergeable, Priority, EventHandle, TaskAllocaBB,
1978 ToBeDeleted](Function &OutlinedFn) mutable {
1979 // Replace the Stale CI by appropriate RTL function call.
1980 assert(OutlinedFn.hasOneUse() &&
1981 "there must be a single user for the outlined function");
1982 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1983
1984 // HasShareds is true if any variables are captured in the outlined region,
1985 // false otherwise.
1986 bool HasShareds = StaleCI->arg_size() > 1;
1987 Builder.SetInsertPoint(StaleCI);
1988
1989 // Gather the arguments for emitting the runtime call for
1990 // @__kmpc_omp_task_alloc
1991 Function *TaskAllocFn =
1992 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1993
1994     // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the
1995     // @__kmpc_omp_task_alloc call.
1996 Value *ThreadID = getOrCreateThreadID(Ident);
1997
1998 // Argument - `flags`
1999 // Task is tied iff (Flags & 1) == 1.
2000 // Task is untied iff (Flags & 1) == 0.
2001 // Task is final iff (Flags & 2) == 2.
2002 // Task is not final iff (Flags & 2) == 0.
2003 // Task is mergeable iff (Flags & 4) == 4.
2004 // Task is not mergeable iff (Flags & 4) == 0.
2005 // Task is priority iff (Flags & 32) == 32.
2006 // Task is not priority iff (Flags & 32) == 0.
2007 // TODO: Handle the other flags.
2008 Value *Flags = Builder.getInt32(Tied);
2009 if (Final) {
2010       Value *FinalFlag =
2011           Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2012       Flags = Builder.CreateOr(FinalFlag, Flags);
2013 }
2014
2015 if (Mergeable)
2016 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2017 if (Priority)
2018 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2019
2020 // Argument - `sizeof_kmp_task_t` (TaskSize)
2021     // TaskSize refers to the size in bytes of the kmp_task_t data structure,
2022     // including private vars accessed in the task.
2023     // TODO: add kmp_task_t_with_privates (privates)
2024     Value *TaskSize = Builder.getInt64(
2025         divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2027 // Argument - `sizeof_shareds` (SharedsSize)
2028 // SharedsSize refers to the shareds array size in the kmp_task_t data
2029 // structure.
2030 Value *SharedsSize = Builder.getInt64(0);
2031 if (HasShareds) {
2032 AllocaInst *ArgStructAlloca =
2033 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2034 assert(ArgStructAlloca &&
2035 "Unable to find the alloca instruction corresponding to arguments "
2036 "for extracted function");
2037 StructType *ArgStructType =
2038 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2039 assert(ArgStructType && "Unable to find struct type corresponding to "
2040 "arguments for extracted function");
2041 SharedsSize =
2042           Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2043     }
2044 // Emit the @__kmpc_omp_task_alloc runtime call
2045 // The runtime call returns a pointer to an area where the task captured
2046 // variables must be copied before the task is run (TaskData)
2047 CallInst *TaskData = Builder.CreateCall(
2048 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2049 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2050 /*task_func=*/&OutlinedFn});
2051
2052 // Emit detach clause initialization.
2053 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2054 // task_descriptor);
2055 if (EventHandle) {
2056       Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2057           OMPRTL___kmpc_task_allow_completion_event);
2058 llvm::Value *EventVal =
2059 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2060 llvm::Value *EventHandleAddr =
2061           Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2062                                                       Builder.getPtrTy(0));
2063 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2064 Builder.CreateStore(EventVal, EventHandleAddr);
2065 }
2066 // Copy the arguments for outlined function
2067 if (HasShareds) {
2068 Value *Shareds = StaleCI->getArgOperand(1);
2069 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2070 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2071 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2072 SharedsSize);
2073 }
2074
2075 if (Priority) {
2076 //
2077 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2078 // we populate the priority information into the "kmp_task_t" here
2079 //
2080 // The struct "kmp_task_t" definition is available in kmp.h
2081 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2082 // data2 is used for priority
2083 //
2084 Type *Int32Ty = Builder.getInt32Ty();
2085 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2086 // kmp_task_t* => { ptr }
2087 Type *TaskPtr = StructType::get(VoidPtr);
2088 Value *TaskGEP =
2089 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2090 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2091 Type *TaskStructType = StructType::get(
2092 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2093 Value *PriorityData = Builder.CreateInBoundsGEP(
2094 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2095 // kmp_cmplrdata_t => { ptr, ptr }
2096 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2097 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2098 PriorityData, {Zero, Zero});
2099 Builder.CreateStore(Priority, CmplrData);
2100 }
2101
2102 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2103
2104 // In the presence of the `if` clause, the following IR is generated:
2105 // ...
2106 // %data = call @__kmpc_omp_task_alloc(...)
2107 // br i1 %if_condition, label %then, label %else
2108 // then:
2109 // call @__kmpc_omp_task(...)
2110 // br label %exit
2111 // else:
2112 // ;; Wait for resolution of dependencies, if any, before
2113 // ;; beginning the task
2114 // call @__kmpc_omp_wait_deps(...)
2115 // call @__kmpc_omp_task_begin_if0(...)
2116 // call @outlined_fn(...)
2117 // call @__kmpc_omp_task_complete_if0(...)
2118 // br label %exit
2119 // exit:
2120 // ...
2121 if (IfCondition) {
2122 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2123 // terminator.
2124 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2125 Instruction *IfTerminator =
2126 Builder.GetInsertPoint()->getParent()->getTerminator();
2127 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2128 Builder.SetInsertPoint(IfTerminator);
2129 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2130 &ElseTI);
2131 Builder.SetInsertPoint(ElseTI);
2132
2133 if (Dependencies.size()) {
2134 Function *TaskWaitFn =
2135 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2136         Builder.CreateCall(
2137             TaskWaitFn,
2138             {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2139              ConstantInt::get(Builder.getInt32Ty(), 0),
2140              ConstantPointerNull::get(Builder.getPtrTy())});
2141       }
2142 Function *TaskBeginFn =
2143 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2144 Function *TaskCompleteFn =
2145 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2146 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2147 CallInst *CI = nullptr;
2148 if (HasShareds)
2149 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2150 else
2151 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2152 CI->setDebugLoc(StaleCI->getDebugLoc());
2153 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2154 Builder.SetInsertPoint(ThenTI);
2155 }
2156
2157 if (Dependencies.size()) {
2158 Function *TaskFn =
2159 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2160       Builder.CreateCall(
2161           TaskFn,
2162           {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2163            DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2164            ConstantPointerNull::get(Builder.getPtrTy())});
2165
2166 } else {
2167 // Emit the @__kmpc_omp_task runtime call to spawn the task
2168 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2169 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2170 }
2171
2172 StaleCI->eraseFromParent();
2173
2174 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2175 if (HasShareds) {
2176 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2177 OutlinedFn.getArg(1)->replaceUsesWithIf(
2178 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2179 }
2180
2181 for (Instruction *I : llvm::reverse(ToBeDeleted))
2182 I->eraseFromParent();
2183 };
2184
2185 addOutlineInfo(std::move(OI));
2186 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2187
2188 return Builder.saveIP();
2189}
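//
// For reference, a rough sketch of the spawn sequence the post-outline
// callback materializes for a plain tied task without dependencies (sizes and
// names illustrative):
// ```
//   %task = call ptr @__kmpc_omp_task_alloc(ptr @loc, i32 %gtid, i32 1,
//                                           i64 %sizeof.kmp_task_t,
//                                           i64 %sizeof.shareds,
//                                           ptr @outlined.task.fn)
//   call i32 @__kmpc_omp_task(ptr @loc, i32 %gtid, ptr %task)
// ```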
2190
2191 OpenMPIRBuilder::InsertPointOrErrorTy
2192 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2193                                  InsertPointTy AllocaIP,
2194 BodyGenCallbackTy BodyGenCB) {
2195 if (!updateToLocation(Loc))
2196 return InsertPointTy();
2197
2198 uint32_t SrcLocStrSize;
2199 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2200 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2201 Value *ThreadID = getOrCreateThreadID(Ident);
2202
2203 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2204 Function *TaskgroupFn =
2205 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2206 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2207
2208 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2209 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2210 return Err;
2211
2212 Builder.SetInsertPoint(TaskgroupExitBB);
2213 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2214 Function *EndTaskgroupFn =
2215 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2216 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2217
2218 return Builder.saveIP();
2219}
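//
// For reference, the taskgroup body ends up bracketed by the two runtime
// calls (names illustrative):
// ```
//   call void @__kmpc_taskgroup(ptr @loc, i32 %gtid)
//   ; ... taskgroup body, including any spawned tasks ...
//   call void @__kmpc_end_taskgroup(ptr @loc, i32 %gtid)
// ```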
2220
2221 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2222     const LocationDescription &Loc, InsertPointTy AllocaIP,
2223     ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2224     FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2225 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2226
2227 if (!updateToLocation(Loc))
2228 return Loc.IP;
2229
2230   // FiniCBWrapper needs to create a branch to the loop finalization block, but
2231   // that block may not yet have been created when this callback runs.
2232 SmallVector<BranchInst *> CancellationBranches;
2233 auto FiniCBWrapper = [&](InsertPointTy IP) {
2234 if (IP.getBlock()->end() != IP.getPoint())
2235 return FiniCB(IP);
2236 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2237 // will fail because that function requires the Finalization Basic Block to
2238 // have a terminator, which is already removed by EmitOMPRegionBody.
2239     // IP is currently at the cancellation block.
2240 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2241 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2242 CancellationBranches.push_back(DummyBranch);
2243 return FiniCB(IP);
2244 };
2245
2246 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2247
2248 // Each section is emitted as a switch case
2249 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2250 // -> OMP.createSection() which generates the IR for each section
2251 // Iterate through all sections and emit a switch construct:
2252 // switch (IV) {
2253 // case 0:
2254 // <SectionStmt[0]>;
2255 // break;
2256 // ...
2257 // case <NumSection> - 1:
2258 // <SectionStmt[<NumSection> - 1]>;
2259 // break;
2260 // }
2261 // ...
2262 // section_loop.after:
2263 // <FiniCB>;
2264 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2265 Builder.restoreIP(CodeGenIP);
2266     BasicBlock *Continue =
2267         splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2268 Function *CurFn = Continue->getParent();
2269 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2270
2271 unsigned CaseNumber = 0;
2272 for (auto SectionCB : SectionCBs) {
2273       BasicBlock *CaseBB = BasicBlock::Create(
2274           M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2275 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2276 Builder.SetInsertPoint(CaseBB);
2277 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2278 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2279 CaseEndBr->getIterator()}))
2280 return Err;
2281 CaseNumber++;
2282 }
2283 // remove the existing terminator from body BB since there can be no
2284 // terminators after switch/case
2285 return Error::success();
2286 };
2287 // Loop body ends here
2288   // LowerBound, UpperBound, and Stride for createCanonicalLoop
2289 Type *I32Ty = Type::getInt32Ty(M.getContext());
2290 Value *LB = ConstantInt::get(I32Ty, 0);
2291 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2292 Value *ST = ConstantInt::get(I32Ty, 1);
2293   Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2294       Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2295 if (!LoopInfo)
2296 return LoopInfo.takeError();
2297
2298 InsertPointOrErrorTy WsloopIP =
2299 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2300 WorksharingLoopType::ForStaticLoop, !IsNowait);
2301 if (!WsloopIP)
2302 return WsloopIP.takeError();
2303 InsertPointTy AfterIP = *WsloopIP;
2304
2305 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2306 assert(LoopFini && "Bad structure of static workshare loop finalization");
2307
2308 // Apply the finalization callback in LoopAfterBB
2309 auto FiniInfo = FinalizationStack.pop_back_val();
2310 assert(FiniInfo.DK == OMPD_sections &&
2311 "Unexpected finalization stack state!");
2312 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2313 Builder.restoreIP(AfterIP);
2314 BasicBlock *FiniBB =
2315 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2316 if (Error Err = CB(Builder.saveIP()))
2317 return Err;
2318 AfterIP = {FiniBB, FiniBB->begin()};
2319 }
2320
2321 // Now we can fix the dummy branch to point to the right place
2322 for (BranchInst *DummyBranch : CancellationBranches) {
2323 assert(DummyBranch->getNumSuccessors() == 1);
2324 DummyBranch->setSuccessor(0, LoopFini);
2325 }
2326
2327 return AfterIP;
2328}
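//
// For reference, with two sections the loop body generated above dispatches
// on the induction variable roughly like this (labels illustrative):
// ```
//   switch i32 %iv, label %omp_section_loop.body.sections.after [
//     i32 0, label %omp_section_loop.body.case
//     i32 1, label %omp_section_loop.body.case1
//   ]
// ```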
2329
2330 OpenMPIRBuilder::InsertPointOrErrorTy
2331 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2332                                BodyGenCallbackTy BodyGenCB,
2333 FinalizeCallbackTy FiniCB) {
2334 if (!updateToLocation(Loc))
2335 return Loc.IP;
2336
2337 auto FiniCBWrapper = [&](InsertPointTy IP) {
2338 if (IP.getBlock()->end() != IP.getPoint())
2339 return FiniCB(IP);
2340 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2341 // will fail because that function requires the Finalization Basic Block to
2342 // have a terminator, which is already removed by EmitOMPRegionBody.
2343     // IP is currently at the cancellation block.
2344     // We need to backtrack to the condition block to fetch
2345     // the exit block and create a branch from the cancellation
2346     // block to the exit block.
2347     IRBuilder<>::InsertPointGuard IPG(Builder);
2348     Builder.restoreIP(IP);
2349 auto *CaseBB = Loc.IP.getBlock();
2350 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2351 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2352 Instruction *I = Builder.CreateBr(ExitBB);
2353 IP = InsertPointTy(I->getParent(), I->getIterator());
2354 return FiniCB(IP);
2355 };
2356
2357 Directive OMPD = Directive::OMPD_sections;
2358 // Since we are using Finalization Callback here, HasFinalize
2359 // and IsCancellable have to be true
2360 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2361 /*Conditional*/ false, /*hasFinalize*/ true,
2362 /*IsCancellable*/ true);
2363}
2364
2365 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2366   BasicBlock::iterator IT(I);
2367   IT++;
2368 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2369}
2370
2371Value *OpenMPIRBuilder::getGPUThreadID() {
2372 return Builder.CreateCall(
2373       getOrCreateRuntimeFunction(M,
2374                                  OMPRTL___kmpc_get_hardware_thread_id_in_block),
2375 {});
2376}
2377
2378Value *OpenMPIRBuilder::getGPUWarpSize() {
2379 return Builder.CreateCall(
2380 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2381}
2382
2383Value *OpenMPIRBuilder::getNVPTXWarpID() {
2384 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2385 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2386}
2387
2388Value *OpenMPIRBuilder::getNVPTXLaneID() {
2389 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2390 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2391 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2392 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2393 "nvptx_lane_id");
2394}
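//
// For reference, with the common 32-lane warp size (LaneIDBits == 5) the two
// helpers above reduce to (illustrative):
// ```
//   %nvptx_warp_id = ashr i32 %tid, 5
//   %nvptx_lane_id = and i32 %tid, 31
// ```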
2395
2396Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2397 Type *ToType) {
2398 Type *FromType = From->getType();
2399 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2400 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2401 assert(FromSize > 0 && "From size must be greater than zero");
2402 assert(ToSize > 0 && "To size must be greater than zero");
2403 if (FromType == ToType)
2404 return From;
2405 if (FromSize == ToSize)
2406 return Builder.CreateBitCast(From, ToType);
2407 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2408 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2409 InsertPointTy SaveIP = Builder.saveIP();
2410 Builder.restoreIP(AllocaIP);
2411 Value *CastItem = Builder.CreateAlloca(ToType);
2412 Builder.restoreIP(SaveIP);
2413
2414   Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2415       CastItem, Builder.getPtrTy(0));
2416 Builder.CreateStore(From, ValCastItem);
2417 return Builder.CreateLoad(ToType, CastItem);
2418}
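//
// For reference, the fallback path above round-trips the value through a
// stack slot when the sizes differ and no direct cast applies, e.g. widening
// a float into an i64 (illustrative):
// ```
//   %cast.item = alloca i64
//   store float %from, ptr %cast.item
//   %to = load i64, ptr %cast.item
// ```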
2419
2420Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2421 Value *Element,
2422 Type *ElementType,
2423 Value *Offset) {
2424 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2425 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2426
2427 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2428 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2429 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2430 Value *WarpSize =
2431 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2432   Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2433       Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2434 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2435 Value *WarpSizeCast =
2436 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2437 Value *ShuffleCall =
2438 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2439 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2440}
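//
// For reference, for an element of at most 4 bytes this expands to roughly
// (names illustrative):
// ```
//   %warp.size = call i32 @__kmpc_get_warp_size()
//   %ws = trunc i32 %warp.size to i16
//   %res = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %ws)
// ```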
2441
2442void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2443 Value *DstAddr, Type *ElemType,
2444 Value *Offset, Type *ReductionArrayTy) {
2445   uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2446   // Create the loop over the big sized data.
2447 // ptr = (void*)Elem;
2448 // ptrEnd = (void*) Elem + 1;
2449 // Step = 8;
2450 // while (ptr + Step < ptrEnd)
2451 // shuffle((int64_t)*ptr);
2452 // Step = 4;
2453 // while (ptr + Step < ptrEnd)
2454 // shuffle((int32_t)*ptr);
2455 // ...
2456 Type *IndexTy = Builder.getIndexTy(
2457       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2458   Value *ElemPtr = DstAddr;
2459 Value *Ptr = SrcAddr;
2460 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2461 if (Size < IntSize)
2462 continue;
2463 Type *IntType = Builder.getIntNTy(IntSize * 8);
2464     Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2465         Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2466 Value *SrcAddrGEP =
2467 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2468     ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2469         ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2470
2471 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2472 if ((Size / IntSize) > 1) {
2473       Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2474           SrcAddrGEP, Builder.getPtrTy());
2475 BasicBlock *PreCondBB =
2476 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2477 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2478 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2479 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2480 emitBlock(PreCondBB, CurFunc);
2481 PHINode *PhiSrc =
2482 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2483 PhiSrc->addIncoming(Ptr, CurrentBB);
2484 PHINode *PhiDest =
2485 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2486 PhiDest->addIncoming(ElemPtr, CurrentBB);
2487 Ptr = PhiSrc;
2488 ElemPtr = PhiDest;
2489 Value *PtrDiff = Builder.CreatePtrDiff(
2490 Builder.getInt8Ty(), PtrEnd,
2491           Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2492       Builder.CreateCondBr(
2493           Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2494 ExitBB);
2495 emitBlock(ThenBB, CurFunc);
2496 Value *Res = createRuntimeShuffleFunction(
2497 AllocaIP,
2498           Builder.CreateAlignedLoad(
2499               IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2500 IntType, Offset);
2501 Builder.CreateAlignedStore(Res, ElemPtr,
2502 M.getDataLayout().getPrefTypeAlign(ElemType));
2503 Value *LocalPtr =
2504 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2505 Value *LocalElemPtr =
2506 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2507 PhiSrc->addIncoming(LocalPtr, ThenBB);
2508 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2509 emitBranch(PreCondBB);
2510 emitBlock(ExitBB, CurFunc);
2511 } else {
2512 Value *Res = createRuntimeShuffleFunction(
2513 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2514 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2515 Res->getType()->getScalarSizeInBits())
2516 Res = Builder.CreateTrunc(Res, ElemType);
2517 Builder.CreateStore(Res, ElemPtr);
2518 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2519 ElemPtr =
2520 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2521 }
2522 Size = Size % IntSize;
2523 }
2524}
2525
2526void OpenMPIRBuilder::emitReductionListCopy(
2527 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2528 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2529 CopyOptionsTy CopyOptions) {
2530 Type *IndexTy = Builder.getIndexTy(
2531       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2532   Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2533
2534   // Iterate, element by element, through the source Reduce list and
2535   // make a copy.
2536 for (auto En : enumerate(ReductionInfos)) {
2537 const ReductionInfo &RI = En.value();
2538 Value *SrcElementAddr = nullptr;
2539 Value *DestElementAddr = nullptr;
2540 Value *DestElementPtrAddr = nullptr;
2541 // Should we shuffle in an element from a remote lane?
2542 bool ShuffleInElement = false;
2543 // Set to true to update the pointer in the dest Reduce list to a
2544 // newly created element.
2545 bool UpdateDestListPtr = false;
2546
2547 // Step 1.1: Get the address for the src element in the Reduce list.
2548 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2549 ReductionArrayTy, SrcBase,
2550 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2551 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2552
2553 // Step 1.2: Create a temporary to store the element in the destination
2554 // Reduce list.
2555 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2556 ReductionArrayTy, DestBase,
2557 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2558 switch (Action) {
2559     case CopyAction::RemoteLaneToThread: {
2560       InsertPointTy CurIP = Builder.saveIP();
2561 Builder.restoreIP(AllocaIP);
2562 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2563 ".omp.reduction.element");
2564 DestAlloca->setAlignment(
2565 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2566 DestElementAddr = DestAlloca;
2567 DestElementAddr =
2568 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2569 DestElementAddr->getName() + ".ascast");
2570 Builder.restoreIP(CurIP);
2571 ShuffleInElement = true;
2572 UpdateDestListPtr = true;
2573 break;
2574 }
2575     case CopyAction::ThreadCopy: {
2576       DestElementAddr =
2577 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2578 break;
2579 }
2580 }
2581
2582 // Now that all active lanes have read the element in the
2583 // Reduce list, shuffle over the value from the remote lane.
2584 if (ShuffleInElement) {
2585 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2586 RemoteLaneOffset, ReductionArrayTy);
2587 } else {
2588 switch (RI.EvaluationKind) {
2589 case EvalKind::Scalar: {
2590 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2591 // Store the source element value to the dest element address.
2592 Builder.CreateStore(Elem, DestElementAddr);
2593 break;
2594 }
2595 case EvalKind::Complex: {
2596       Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2597           RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2598 Value *SrcReal = Builder.CreateLoad(
2599 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2600       Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2601           RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2602 Value *SrcImg = Builder.CreateLoad(
2603 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2604
2605       Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2606           RI.ElementType, DestElementAddr, 0, 0, ".realp");
2607       Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2608           RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2609 Builder.CreateStore(SrcReal, DestRealPtr);
2610 Builder.CreateStore(SrcImg, DestImgPtr);
2611 break;
2612 }
2613 case EvalKind::Aggregate: {
2614 Value *SizeVal = Builder.getInt64(
2615 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2616       Builder.CreateMemCpy(
2617           DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2618 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2619 SizeVal, false);
2620 break;
2621 }
2622 };
2623 }
2624
2625 // Step 3.1: Modify reference in dest Reduce list as needed.
2626 // Modifying the reference in Reduce list to point to the newly
2627 // created element. The element is live in the current function
2628 // scope and that of functions it invokes (i.e., reduce_function).
2629 // RemoteReduceData[i] = (void*)&RemoteElem
2630 if (UpdateDestListPtr) {
2631       Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2632           DestElementAddr, Builder.getPtrTy(),
2633 DestElementAddr->getName() + ".ascast");
2634 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2635 }
2636 }
2637}
2638
2639Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2640 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2641 AttributeList FuncAttrs) {
2642 InsertPointTy SavedIP = Builder.saveIP();
2643 LLVMContext &Ctx = M.getContext();
2644   FunctionType *FuncTy = FunctionType::get(
2645       Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2646 /* IsVarArg */ false);
2647 Function *WcFunc =
2648       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2649                        "_omp_reduction_inter_warp_copy_func", &M);
2650 WcFunc->setAttributes(FuncAttrs);
2651 WcFunc->addParamAttr(0, Attribute::NoUndef);
2652 WcFunc->addParamAttr(1, Attribute::NoUndef);
2653 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2654 Builder.SetInsertPoint(EntryBB);
2655
2656 // ReduceList: thread local Reduce list.
2657 // At the stage of the computation when this function is called, partially
2658 // aggregated values reside in the first lane of every active warp.
2659 Argument *ReduceListArg = WcFunc->getArg(0);
2660 // NumWarps: number of warps active in the parallel region. This could
2661 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2662 Argument *NumWarpsArg = WcFunc->getArg(1);
2663
2664 // This array is used as a medium to transfer, one reduce element at a time,
2665 // the data from the first lane of every warp to lanes in the first warp
2666 // in order to perform the final step of a reduction in a parallel region
2667 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2668 // for reduced latency, as well as to have a distinct copy for concurrently
2669 // executing target regions. The array is declared with common linkage so
2670 // as to be shared across compilation units.
2671 StringRef TransferMediumName =
2672 "__openmp_nvptx_data_transfer_temporary_storage";
2673 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2674 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2675 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2676 if (!TransferMedium) {
2677 TransferMedium = new GlobalVariable(
2678 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2679 UndefValue::get(ArrayTy), TransferMediumName,
2680 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2681 /*AddressSpace=*/3);
2682 }
2683
2684 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2685 Value *GPUThreadID = getGPUThreadID();
2686 // nvptx_lane_id = nvptx_id % warpsize
2687 Value *LaneID = getNVPTXLaneID();
2688 // nvptx_warp_id = nvptx_id / warpsize
2689 Value *WarpID = getNVPTXWarpID();
2690
2691 InsertPointTy AllocaIP =
2692       InsertPointTy(Builder.GetInsertBlock(),
2693                     Builder.GetInsertBlock()->getFirstInsertionPt());
2694   Type *Arg0Type = ReduceListArg->getType();
2695 Type *Arg1Type = NumWarpsArg->getType();
2696 Builder.restoreIP(AllocaIP);
2697 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2698 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2699 AllocaInst *NumWarpsAlloca =
2700 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2701   Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2702       ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2703   Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2704       NumWarpsAlloca, Builder.getPtrTy(0),
2705 NumWarpsAlloca->getName() + ".ascast");
2706 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2707 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2708 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2709 InsertPointTy CodeGenIP =
2710       getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2711   Builder.restoreIP(CodeGenIP);
2712
2713 Value *ReduceList =
2714 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2715
2716 for (auto En : enumerate(ReductionInfos)) {
2717 //
2718 // Warp master copies reduce element to transfer medium in __shared__
2719 // memory.
2720 //
2721 const ReductionInfo &RI = En.value();
2722 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2723 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2724 Type *CType = Builder.getIntNTy(TySize * 8);
2725
2726 unsigned NumIters = RealTySize / TySize;
2727 if (NumIters == 0)
2728 continue;
2729 Value *Cnt = nullptr;
2730 Value *CntAddr = nullptr;
2731 BasicBlock *PrecondBB = nullptr;
2732 BasicBlock *ExitBB = nullptr;
2733 if (NumIters > 1) {
2734 CodeGenIP = Builder.saveIP();
2735 Builder.restoreIP(AllocaIP);
2736 CntAddr =
2737 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2738
2739 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2740 CntAddr->getName() + ".ascast");
2741 Builder.restoreIP(CodeGenIP);
2742         Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2743                             CntAddr,
2744 /*Volatile=*/false);
2745 PrecondBB = BasicBlock::Create(Ctx, "precond");
2746 ExitBB = BasicBlock::Create(Ctx, "exit");
2747 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2748 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2749 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2750 /*Volatile=*/false);
2751         Value *Cmp = Builder.CreateICmpULT(
2752             Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2753 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2754         emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2755       }
2756
2757 // kmpc_barrier.
2758 InsertPointOrErrorTy BarrierIP1 =
2759 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2760 omp::Directive::OMPD_unknown,
2761 /* ForceSimpleCall */ false,
2762 /* CheckCancelFlag */ true);
2763 if (!BarrierIP1)
2764 return BarrierIP1.takeError();
2765 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2766 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2767 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2768
2769 // if (lane_id == 0)
2770 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2771 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2772       emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2773
2774 // Reduce element = LocalReduceList[i]
2775 auto *RedListArrayTy =
2776 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2777 Type *IndexTy = Builder.getIndexTy(
2778           M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2779       Value *ElemPtrPtr =
2780 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2781 {ConstantInt::get(IndexTy, 0),
2782 ConstantInt::get(IndexTy, En.index())});
2783 // elemptr = ((CopyType*)(elemptrptr)) + I
2784 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2785 if (NumIters > 1)
2786 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2787
2788 // Get pointer to location in transfer medium.
2789 // MediumPtr = &medium[warp_id]
2790 Value *MediumPtr = Builder.CreateInBoundsGEP(
2791 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2792 // elem = *elemptr
2793 //*MediumPtr = elem
2794 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2795 // Store the source element value to the dest element address.
2796 Builder.CreateStore(Elem, MediumPtr,
2797 /*IsVolatile*/ true);
2798 Builder.CreateBr(MergeBB);
2799
2800 // else
2801       emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2802       Builder.CreateBr(MergeBB);
2803
2804 // endif
2805       emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2806       InsertPointOrErrorTy BarrierIP2 =
2807 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2808 omp::Directive::OMPD_unknown,
2809 /* ForceSimpleCall */ false,
2810 /* CheckCancelFlag */ true);
2811 if (!BarrierIP2)
2812 return BarrierIP2.takeError();
2813
2814 // Warp 0 copies reduce element from transfer medium
2815 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2816 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2817 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2818
2819 Value *NumWarpsVal =
2820 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2821 // Up to 32 threads in warp 0 are active.
2822 Value *IsActiveThread =
2823 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2824 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2825
2826 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2827
2828 // SecMediumPtr = &medium[tid]
2829 // SrcMediumVal = *SrcMediumPtr
2830 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2831 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2832 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2833 Value *TargetElemPtrPtr =
2834 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2835 {ConstantInt::get(IndexTy, 0),
2836 ConstantInt::get(IndexTy, En.index())});
2837 Value *TargetElemPtrVal =
2838 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2839 Value *TargetElemPtr = TargetElemPtrVal;
2840 if (NumIters > 1)
2841 TargetElemPtr =
2842 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2843
2844 // *TargetElemPtr = SrcMediumVal;
2845 Value *SrcMediumValue =
2846 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2847 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2848 Builder.CreateBr(W0MergeBB);
2849
2850 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2851 Builder.CreateBr(W0MergeBB);
2852
2853 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2854
2855 if (NumIters > 1) {
2856 Cnt = Builder.CreateNSWAdd(
2857 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2858 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2859
2860 auto *CurFn = Builder.GetInsertBlock()->getParent();
2861 emitBranch(PrecondBB);
2862 emitBlock(ExitBB, CurFn);
2863 }
2864 RealTySize %= TySize;
2865 }
2866 }
2867
2868   Builder.CreateRetVoid();
2869   Builder.restoreIP(SavedIP);
2870
2871 return WcFunc;
2872}
2873
2874Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2875 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2876 AttributeList FuncAttrs) {
2877 LLVMContext &Ctx = M.getContext();
2878 FunctionType *FuncTy =
2879       FunctionType::get(Builder.getVoidTy(),
2880                         {Builder.getPtrTy(), Builder.getInt16Ty(),
2881 Builder.getInt16Ty(), Builder.getInt16Ty()},
2882 /* IsVarArg */ false);
2883 Function *SarFunc =
2884       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2885                        "_omp_reduction_shuffle_and_reduce_func", &M);
2886 SarFunc->setAttributes(FuncAttrs);
2887 SarFunc->addParamAttr(0, Attribute::NoUndef);
2888 SarFunc->addParamAttr(1, Attribute::NoUndef);
2889 SarFunc->addParamAttr(2, Attribute::NoUndef);
2890 SarFunc->addParamAttr(3, Attribute::NoUndef);
2891 SarFunc->addParamAttr(1, Attribute::SExt);
2892 SarFunc->addParamAttr(2, Attribute::SExt);
2893 SarFunc->addParamAttr(3, Attribute::SExt);
2894 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2895 Builder.SetInsertPoint(EntryBB);
2896
2897 // Thread local Reduce list used to host the values of data to be reduced.
2898 Argument *ReduceListArg = SarFunc->getArg(0);
2899 // Current lane id; could be logical.
2900 Argument *LaneIDArg = SarFunc->getArg(1);
2901 // Offset of the remote source lane relative to the current lane.
2902 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2903 // Algorithm version. This is expected to be known at compile time.
2904 Argument *AlgoVerArg = SarFunc->getArg(3);
2905
2906 Type *ReduceListArgType = ReduceListArg->getType();
2907 Type *LaneIDArgType = LaneIDArg->getType();
2908 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2909 Value *ReduceListAlloca = Builder.CreateAlloca(
2910 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2911 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2912 LaneIDArg->getName() + ".addr");
2913 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2914 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2915 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2916 AlgoVerArg->getName() + ".addr");
2917 ArrayType *RedListArrayTy =
2918 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2919
2920 // Create a local thread-private variable to host the Reduce list
2921 // from a remote lane.
2922 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2923 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2924
2925   Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2926       ReduceListAlloca, ReduceListArgType,
2927 ReduceListAlloca->getName() + ".ascast");
2928   Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2929       LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2930 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2931 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2932 RemoteLaneOffsetAlloca->getName() + ".ascast");
2933   Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2934       AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2935   Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2936       RemoteReductionListAlloca, Builder.getPtrTy(),
2937 RemoteReductionListAlloca->getName() + ".ascast");
2938
2939 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2940 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2941 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2942 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2943
2944 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2945 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2946 Value *RemoteLaneOffset =
2947 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2948 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2949
2950 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2951
2952 // This loop iterates through the list of reduce elements and copies,
2953 // element by element, from a remote lane in the warp to RemoteReduceList,
2954 // hosted on the thread's stack.
2955 emitReductionListCopy(
2956 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2957 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2958
2959   // The action to be performed on the Remote Reduce list depends on the
2960   // algorithm version.
2961 //
2962 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2963 // LaneId % 2 == 0 && Offset > 0):
2964 // do the reduction value aggregation
2965 //
2966 // The thread local variable Reduce list is mutated in place to host the
2967 // reduced data, which is the aggregated value produced from local and
2968 // remote lanes.
2969 //
2970 // Note that AlgoVer is expected to be a constant integer known at compile
2971 // time.
2972   // When AlgoVer==0, the first conjunction evaluates to true, making
2973   // the entire predicate true at compile time.
2974   // When AlgoVer==1, only the second part of the second conjunction needs
2975   // to be evaluated at runtime; the other conjunctions evaluate to false
2976   // at compile time.
2977   // When AlgoVer==2, only the second part of the third conjunction needs
2978   // to be evaluated at runtime; the other conjunctions evaluate to false
2979   // at compile time.
2980 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2981 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2982 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2983 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2984 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2985 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2986 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2987 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2988 Value *RemoteOffsetComp =
2989 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2990 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2991 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2992 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2993
2994 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2995 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2996 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2997
2998 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2999   emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3000   Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3001       ReduceList, Builder.getPtrTy());
3002 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3003 RemoteListAddrCast, Builder.getPtrTy());
3004 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3005 ->addFnAttr(Attribute::NoUnwind);
3006 Builder.CreateBr(MergeBB);
3007
3008   emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3009   Builder.CreateBr(MergeBB);
3010
3011   emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3012
3013 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3014 // Reduce list.
3015 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3016 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3017 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3018
3019 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3020 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3021 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3022 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3023
3024 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3025 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3026 ReductionInfos, RemoteListAddrCast, ReduceList);
3027 Builder.CreateBr(CpyMergeBB);
3028
3029 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3030 Builder.CreateBr(CpyMergeBB);
3031
3032 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3033
3034   Builder.CreateRetVoid();
3035
3036 return SarFunc;
3037}
3038
3039Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3040 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3041 AttributeList FuncAttrs) {
3042   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3043   LLVMContext &Ctx = M.getContext();
3044   FunctionType *FuncTy = FunctionType::get(
3045       Builder.getVoidTy(),
3046       {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3047       /* IsVarArg */ false);
3048   Function *LtGCFunc =
3049       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3050                        "_omp_reduction_list_to_global_copy_func", &M);
3051 LtGCFunc->setAttributes(FuncAttrs);
3052 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3053 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3054 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3055
3056 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3057 Builder.SetInsertPoint(EntryBlock);
3058
3059 // Buffer: global reduction buffer.
3060 Argument *BufferArg = LtGCFunc->getArg(0);
3061 // Idx: index of the buffer.
3062 Argument *IdxArg = LtGCFunc->getArg(1);
3063 // ReduceList: thread local Reduce list.
3064 Argument *ReduceListArg = LtGCFunc->getArg(2);
3065
3066 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3067 BufferArg->getName() + ".addr");
3068 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3069 IdxArg->getName() + ".addr");
3070 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3071 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3072   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3073       BufferArgAlloca, Builder.getPtrTy(),
3074       BufferArgAlloca->getName() + ".ascast");
3075   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3076       IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3077 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3078 ReduceListArgAlloca, Builder.getPtrTy(),
3079 ReduceListArgAlloca->getName() + ".ascast");
3080
3081 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3082 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3083 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3084
3085 Value *LocalReduceList =
3086 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3087 Value *BufferArgVal =
3088 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3089 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3090 Type *IndexTy = Builder.getIndexTy(
3091       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3092   for (auto En : enumerate(ReductionInfos)) {
3093 const ReductionInfo &RI = En.value();
3094 auto *RedListArrayTy =
3095 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3096 // Reduce element = LocalReduceList[i]
3097 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3098 RedListArrayTy, LocalReduceList,
3099 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3100 // elemptr = ((CopyType*)(elemptrptr)) + I
3101 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3102
3103 // Global = Buffer.VD[Idx];
3104 Value *BufferVD =
3105 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3106     Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3107 ReductionsBufferTy, BufferVD, 0, En.index());
3108
3109 switch (RI.EvaluationKind) {
3110 case EvalKind::Scalar: {
3111 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3112 Builder.CreateStore(TargetElement, GlobVal);
3113 break;
3114 }
3115 case EvalKind::Complex: {
3116       Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3117 RI.ElementType, ElemPtr, 0, 0, ".realp");
3118 Value *SrcReal = Builder.CreateLoad(
3119 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3120       Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3121 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3122 Value *SrcImg = Builder.CreateLoad(
3123 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3124
3125       Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3126 RI.ElementType, GlobVal, 0, 0, ".realp");
3127       Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3128 RI.ElementType, GlobVal, 0, 1, ".imagp");
3129 Builder.CreateStore(SrcReal, DestRealPtr);
3130 Builder.CreateStore(SrcImg, DestImgPtr);
3131 break;
3132 }
3133 case EvalKind::Aggregate: {
3134 Value *SizeVal =
3135 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3136       Builder.CreateMemCpy(
3137 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3138 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3139 break;
3140 }
3141 }
3142 }
3143
3144   Builder.CreateRetVoid();
3145 Builder.restoreIP(OldIP);
3146 return LtGCFunc;
3147}
3148
3149Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3150 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3151 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3152   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3153 LLVMContext &Ctx = M.getContext();
3154   auto *FuncTy = FunctionType::get(
3155       Builder.getVoidTy(),
3156 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3157 /* IsVarArg */ false);
3158 Function *LtGRFunc =
3159       Function::Create(FuncTy, GlobalValue::InternalLinkage,
3160 "_omp_reduction_list_to_global_reduce_func", &M);
3161 LtGRFunc->setAttributes(FuncAttrs);
3162 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3163 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3164 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3165
3166 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3167 Builder.SetInsertPoint(EntryBlock);
3168
3169 // Buffer: global reduction buffer.
3170 Argument *BufferArg = LtGRFunc->getArg(0);
3171 // Idx: index of the buffer.
3172 Argument *IdxArg = LtGRFunc->getArg(1);
3173 // ReduceList: thread local Reduce list.
3174 Argument *ReduceListArg = LtGRFunc->getArg(2);
3175
3176 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3177 BufferArg->getName() + ".addr");
3178 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3179 IdxArg->getName() + ".addr");
3180 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3181 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3182 auto *RedListArrayTy =
3183 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3184
3185 // 1. Build a list of reduction variables.
3186 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3187 Value *LocalReduceList =
3188 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3189
3190   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3191 BufferArgAlloca, Builder.getPtrTy(),
3192 BufferArgAlloca->getName() + ".ascast");
3193   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3194 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3195 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3196 ReduceListArgAlloca, Builder.getPtrTy(),
3197 ReduceListArgAlloca->getName() + ".ascast");
3198 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3199 LocalReduceList, Builder.getPtrTy(),
3200 LocalReduceList->getName() + ".ascast");
3201
3202 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3203 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3204 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3205
3206 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3207 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3208 Type *IndexTy = Builder.getIndexTy(
3209       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3210 for (auto En : enumerate(ReductionInfos)) {
3211 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3212 RedListArrayTy, LocalReduceListAddrCast,
3213 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3214 Value *BufferVD =
3215 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3216 // Global = Buffer.VD[Idx];
3217     Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3218 ReductionsBufferTy, BufferVD, 0, En.index());
3219 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3220 }
3221
3222 // Call reduce_function(GlobalReduceList, ReduceList)
3223 Value *ReduceList =
3224 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3225 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3226 ->addFnAttr(Attribute::NoUnwind);
3227   Builder.CreateRetVoid();
3228 Builder.restoreIP(OldIP);
3229 return LtGRFunc;
3230}
3231
3232Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3233 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3234 AttributeList FuncAttrs) {
3235   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3236 LLVMContext &Ctx = M.getContext();
3237   auto *FuncTy = FunctionType::get(
3238       Builder.getVoidTy(),
3239 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3240 /* IsVarArg */ false);
3241 Function *LtGCFunc =
3242       Function::Create(FuncTy, GlobalValue::InternalLinkage,
3243 "_omp_reduction_global_to_list_copy_func", &M);
3244 LtGCFunc->setAttributes(FuncAttrs);
3245 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3246 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3247 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3248
3249 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3250 Builder.SetInsertPoint(EntryBlock);
3251
3252 // Buffer: global reduction buffer.
3253 Argument *BufferArg = LtGCFunc->getArg(0);
3254 // Idx: index of the buffer.
3255 Argument *IdxArg = LtGCFunc->getArg(1);
3256 // ReduceList: thread local Reduce list.
3257 Argument *ReduceListArg = LtGCFunc->getArg(2);
3258
3259 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3260 BufferArg->getName() + ".addr");
3261 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3262 IdxArg->getName() + ".addr");
3263 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3264 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3265   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3266 BufferArgAlloca, Builder.getPtrTy(),
3267 BufferArgAlloca->getName() + ".ascast");
3268   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3269 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3270 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3271 ReduceListArgAlloca, Builder.getPtrTy(),
3272 ReduceListArgAlloca->getName() + ".ascast");
3273 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3274 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3275 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3276
3277 Value *LocalReduceList =
3278 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3279 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3280 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3281 Type *IndexTy = Builder.getIndexTy(
3282       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3283 for (auto En : enumerate(ReductionInfos)) {
3284 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3285 auto *RedListArrayTy =
3286 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3287 // Reduce element = LocalReduceList[i]
3288 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3289 RedListArrayTy, LocalReduceList,
3290 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3291 // elemptr = ((CopyType*)(elemptrptr)) + I
3292 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3293 // Global = Buffer.VD[Idx];
3294 Value *BufferVD =
3295 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3296     Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3297 ReductionsBufferTy, BufferVD, 0, En.index());
3298
3299 switch (RI.EvaluationKind) {
3300 case EvalKind::Scalar: {
3301 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3302 Builder.CreateStore(TargetElement, ElemPtr);
3303 break;
3304 }
3305 case EvalKind::Complex: {
3306       Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3307 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3308 Value *SrcReal = Builder.CreateLoad(
3309 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3310       Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3311 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3312 Value *SrcImg = Builder.CreateLoad(
3313 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3314
3315       Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3316 RI.ElementType, ElemPtr, 0, 0, ".realp");
3317       Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3318 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3319 Builder.CreateStore(SrcReal, DestRealPtr);
3320 Builder.CreateStore(SrcImg, DestImgPtr);
3321 break;
3322 }
3323 case EvalKind::Aggregate: {
3324 Value *SizeVal =
3325           Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3326       Builder.CreateMemCpy(
3327           ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3328 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3329 SizeVal, false);
3330 break;
3331 }
3332 }
3333 }
3334
3335   Builder.CreateRetVoid();
3336 Builder.restoreIP(OldIP);
3337 return LtGCFunc;
3338}
3339
3340Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3341 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3342 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3343   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3344 LLVMContext &Ctx = M.getContext();
3345 auto *FuncTy = FunctionType::get(
3346       Builder.getVoidTy(),
3347 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3348 /* IsVarArg */ false);
3349 Function *LtGRFunc =
3350       Function::Create(FuncTy, GlobalValue::InternalLinkage,
3351 "_omp_reduction_global_to_list_reduce_func", &M);
3352 LtGRFunc->setAttributes(FuncAttrs);
3353 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3354 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3355 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3356
3357 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3358 Builder.SetInsertPoint(EntryBlock);
3359
3360 // Buffer: global reduction buffer.
3361 Argument *BufferArg = LtGRFunc->getArg(0);
3362 // Idx: index of the buffer.
3363 Argument *IdxArg = LtGRFunc->getArg(1);
3364 // ReduceList: thread local Reduce list.
3365 Argument *ReduceListArg = LtGRFunc->getArg(2);
3366
3367 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3368 BufferArg->getName() + ".addr");
3369 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3370 IdxArg->getName() + ".addr");
3371 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3372 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3373 ArrayType *RedListArrayTy =
3374 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3375
3376 // 1. Build a list of reduction variables.
3377 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3378 Value *LocalReduceList =
3379 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3380
3381   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3382 BufferArgAlloca, Builder.getPtrTy(),
3383 BufferArgAlloca->getName() + ".ascast");
3384   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3385 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3386 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3387 ReduceListArgAlloca, Builder.getPtrTy(),
3388 ReduceListArgAlloca->getName() + ".ascast");
3389   Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3390 LocalReduceList, Builder.getPtrTy(),
3391 LocalReduceList->getName() + ".ascast");
3392
3393 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3394 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3395 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3396
3397 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3398 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3399 Type *IndexTy = Builder.getIndexTy(
3400       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3401 for (auto En : enumerate(ReductionInfos)) {
3402 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3403 RedListArrayTy, ReductionList,
3404 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3405 // Global = Buffer.VD[Idx];
3406 Value *BufferVD =
3407 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3408     Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3409 ReductionsBufferTy, BufferVD, 0, En.index());
3410 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3411 }
3412
3413 // Call reduce_function(ReduceList, GlobalReduceList)
3414 Value *ReduceList =
3415 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3416 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3417 ->addFnAttr(Attribute::NoUnwind);
3418   Builder.CreateRetVoid();
3419 Builder.restoreIP(OldIP);
3420 return LtGRFunc;
3421}
3422
3423std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3424 std::string Suffix =
3425 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3426 return (Name + Suffix).str();
3427}
3428
3429Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3430 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3431 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3432 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3433 {Builder.getPtrTy(), Builder.getPtrTy()},
3434 /* IsVarArg */ false);
3435 std::string Name = getReductionFuncName(ReducerName);
3436 Function *ReductionFunc =
3437       Function::Create(FuncTy, GlobalValue::InternalLinkage, Name, &M);
3438 ReductionFunc->setAttributes(FuncAttrs);
3439 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3440 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3441 BasicBlock *EntryBB =
3442 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3443 Builder.SetInsertPoint(EntryBB);
3444
3445   // We need to alloca memory here and handle the pointer casts before
3446   // extracting the LHS/RHS pointers.
3447 Value *LHSArrayPtr = nullptr;
3448 Value *RHSArrayPtr = nullptr;
3449 Argument *Arg0 = ReductionFunc->getArg(0);
3450 Argument *Arg1 = ReductionFunc->getArg(1);
3451 Type *Arg0Type = Arg0->getType();
3452 Type *Arg1Type = Arg1->getType();
3453
3454 Value *LHSAlloca =
3455 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3456 Value *RHSAlloca =
3457 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3458   Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3459 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3460   Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3461 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3462 Builder.CreateStore(Arg0, LHSAddrCast);
3463 Builder.CreateStore(Arg1, RHSAddrCast);
3464 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3465 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3466
3467 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3468 Type *IndexTy = Builder.getIndexTy(
3469       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3470 SmallVector<Value *> LHSPtrs, RHSPtrs;
3471 for (auto En : enumerate(ReductionInfos)) {
3472 const ReductionInfo &RI = En.value();
3473 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3474 RedArrayTy, RHSArrayPtr,
3475 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3476 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3477     Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3478 RHSI8Ptr, RI.PrivateVariable->getType(),
3479 RHSI8Ptr->getName() + ".ascast");
3480
3481 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3482 RedArrayTy, LHSArrayPtr,
3483 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3484 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3485     Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3486 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3487
3488     if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3489 LHSPtrs.emplace_back(LHSPtr);
3490 RHSPtrs.emplace_back(RHSPtr);
3491 } else {
3492 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3493 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3494 Value *Reduced;
3495 InsertPointOrErrorTy AfterIP =
3496 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3497 if (!AfterIP)
3498 return AfterIP.takeError();
3499 if (!Builder.GetInsertBlock())
3500 return ReductionFunc;
3501 Builder.CreateStore(Reduced, LHSPtr);
3502 }
3503 }
3504
3505   if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3506 for (auto En : enumerate(ReductionInfos)) {
3507 unsigned Index = En.index();
3508 const ReductionInfo &RI = En.value();
3509 Value *LHSFixupPtr, *RHSFixupPtr;
3510 Builder.restoreIP(RI.ReductionGenClang(
3511 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3512
3513     // Fix the callback code generated to use the correct Values for the LHS
3514 // and RHS
3515 LHSFixupPtr->replaceUsesWithIf(
3516 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3517 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3518 ReductionFunc;
3519 });
3520 RHSFixupPtr->replaceUsesWithIf(
3521 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3522 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3523 ReductionFunc;
3524 });
3525 }
3526
3527   Builder.CreateRetVoid();
3528 return ReductionFunc;
3529}
3530
3531static void
3532 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3533 bool IsGPU) {
3534 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3535 (void)RI;
3536 assert(RI.Variable && "expected non-null variable");
3537 assert(RI.PrivateVariable && "expected non-null private variable");
3538 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3539 "expected non-null reduction generator callback");
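     // Note: the check below is relaxed for GPUs, since the private copy of a
     // reduction variable may live in a different address space (e.g. local or
     // shared memory), so its pointer type need not match the original's.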
3540 if (!IsGPU) {
3541 assert(
3542 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3543 "expected variables and their private equivalents to have the same "
3544 "type");
3545 }
3546 assert(RI.Variable->getType()->isPointerTy() &&
3547 "expected variables to be pointers");
3548 }
3549}
3550
3551 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3552 const LocationDescription &Loc, InsertPointTy AllocaIP,
3553 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3554 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3555 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3556 Value *SrcLocInfo) {
3557 if (!updateToLocation(Loc))
3558 return InsertPointTy();
3559 Builder.restoreIP(CodeGenIP);
3560 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3561 LLVMContext &Ctx = M.getContext();
3562
3563 // Source location for the ident struct
3564 if (!SrcLocInfo) {
3565 uint32_t SrcLocStrSize;
3566 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3567 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3568 }
3569
3570 if (ReductionInfos.size() == 0)
3571 return Builder.saveIP();
3572
3573 BasicBlock *ContinuationBlock = nullptr;
3574   if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3575 // Copied code from createReductions
3576 BasicBlock *InsertBlock = Loc.IP.getBlock();
3577 ContinuationBlock =
3578 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3579 InsertBlock->getTerminator()->eraseFromParent();
3580 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3581 }
3582
3583 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3584 AttributeList FuncAttrs;
3585 AttrBuilder AttrBldr(Ctx);
3586 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3587 AttrBldr.addAttribute(Attr);
3588 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3589 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3590
3591 CodeGenIP = Builder.saveIP();
3592 Expected<Function *> ReductionResult =
3593 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3594 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3595 if (!ReductionResult)
3596 return ReductionResult.takeError();
3597 Function *ReductionFunc = *ReductionResult;
3598 Builder.restoreIP(CodeGenIP);
3599
3600 // Set the grid value in the config needed for lowering later on
3601 if (GridValue.has_value())
3602 Config.setGridValue(GridValue.value());
3603 else
3604 Config.setGridValue(getGridValue(T, ReductionFunc));
3605
3606 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3607 // RedList, shuffle_reduce_func, interwarp_copy_func);
3608 // or
3609 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
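  // Sketch of the non-teams form (argument names abbreviated, not literal
  // output):
  //   %res = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(
  //       ptr %ident, i64 %reduce_data_size, ptr %red_list,
  //       ptr @shuffle_and_reduce_func, ptr @inter_warp_copy_func)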
3610 Value *Res;
3611
3612 // 1. Build a list of reduction variables.
3613 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3614 auto Size = ReductionInfos.size();
3615 Type *PtrTy = PointerType::getUnqual(Ctx);
3616 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3617 CodeGenIP = Builder.saveIP();
3618 Builder.restoreIP(AllocaIP);
3619 Value *ReductionListAlloca =
3620 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3621   Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3622 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3623 Builder.restoreIP(CodeGenIP);
3624 Type *IndexTy = Builder.getIndexTy(
3625       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3626 for (auto En : enumerate(ReductionInfos)) {
3627 const ReductionInfo &RI = En.value();
3628 Value *ElemPtr = Builder.CreateInBoundsGEP(
3629 RedArrayTy, ReductionList,
3630 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3631 Value *CastElem =
3632         Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3633 Builder.CreateStore(CastElem, ElemPtr);
3634 }
3635 CodeGenIP = Builder.saveIP();
3636 Function *SarFunc =
3637 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3638 Expected<Function *> CopyResult =
3639 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3640 if (!CopyResult)
3641 return CopyResult.takeError();
3642 Function *WcFunc = *CopyResult;
3643 Builder.restoreIP(CodeGenIP);
3644
3645 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3646
3647 unsigned MaxDataSize = 0;
3648 SmallVector<Type *> ReductionTypeArgs;
3649 for (auto En : enumerate(ReductionInfos)) {
3650 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3651 if (Size > MaxDataSize)
3652 MaxDataSize = Size;
3653 ReductionTypeArgs.emplace_back(En.value().ElementType);
3654 }
3655 Value *ReductionDataSize =
3656 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3657 if (!IsTeamsReduction) {
3658 Value *SarFuncCast =
3659         Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3660 Value *WcFuncCast =
3661         Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3662 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3663 WcFuncCast};
3664     Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3665 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3666 Res = Builder.CreateCall(Pv2Ptr, Args);
3667 } else {
3668 CodeGenIP = Builder.saveIP();
3669 StructType *ReductionsBufferTy = StructType::create(
3670 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3671 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3672 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3673 Function *LtGCFunc = emitListToGlobalCopyFunction(
3674 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3675 Function *LtGRFunc = emitListToGlobalReduceFunction(
3676 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3677 Function *GtLCFunc = emitGlobalToListCopyFunction(
3678 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3679 Function *GtLRFunc = emitGlobalToListReduceFunction(
3680 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3681 Builder.restoreIP(CodeGenIP);
3682
3683 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3684 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3685
3686 Value *Args3[] = {SrcLocInfo,
3687 KernelTeamsReductionPtr,
3688 Builder.getInt32(ReductionBufNum),
3689 ReductionDataSize,
3690 RL,
3691 SarFunc,
3692 WcFunc,
3693 LtGCFunc,
3694 LtGRFunc,
3695 GtLCFunc,
3696 GtLRFunc};
3697
3698 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3699 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3700 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3701 }
3702
3703 // 5. Build if (res == 1)
3704 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3705 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3706   Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3707 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3708
3709 // 6. Build then branch: where we have reduced values in the master
3710 // thread in each team.
3711 // __kmpc_end_reduce{_nowait}(<gtid>);
3712 // break;
3713 emitBlock(ThenBB, CurFunc);
3714
3715 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3716 for (auto En : enumerate(ReductionInfos)) {
3717 const ReductionInfo &RI = En.value();
3718 Value *LHS = RI.Variable;
3719 Value *RHS =
3720         Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3721
3722     if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3723 Value *LHSPtr, *RHSPtr;
3724       Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3725 &LHSPtr, &RHSPtr, CurFunc));
3726
3727       // Fix the callback code generated to use the correct Values for the LHS
3728 // and RHS
3729 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3730 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3731 ReductionFunc;
3732 });
3733 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3734 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3735 ReductionFunc;
3736 });
3737 } else {
3738 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3739 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3740 Value *Reduced;
3741 InsertPointOrErrorTy AfterIP =
3742 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3743 if (!AfterIP)
3744 return AfterIP.takeError();
3745 Builder.CreateStore(Reduced, LHS, false);
3746 }
3747 }
3748 emitBlock(ExitBB, CurFunc);
3749 if (ContinuationBlock) {
3750 Builder.CreateBr(ContinuationBlock);
3751 Builder.SetInsertPoint(ContinuationBlock);
3752 }
3753   Config.setEmitLLVMUsed();
3754
3755 return Builder.saveIP();
3756}
3757
3758 static Function *getFreshReductionFunc(Module &M) {
3759 Type *VoidTy = Type::getVoidTy(M.getContext());
3760 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3761 auto *FuncTy =
3762 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3763   return Function::Create(FuncTy, GlobalValue::InternalLinkage,
3764 ".omp.reduction.func", &M);
3765}
3766
3767 static Error populateReductionFunction(
3768 Function *ReductionFunc,
3769     ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3770 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3771 Module *Module = ReductionFunc->getParent();
3772 BasicBlock *ReductionFuncBlock =
3773 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3774 Builder.SetInsertPoint(ReductionFuncBlock);
3775 Value *LHSArrayPtr = nullptr;
3776 Value *RHSArrayPtr = nullptr;
3777 if (IsGPU) {
3778     // We need to alloca memory here and handle the pointer casts before
3779     // extracting the LHS/RHS pointers.
3780 //
3781 Argument *Arg0 = ReductionFunc->getArg(0);
3782 Argument *Arg1 = ReductionFunc->getArg(1);
3783 Type *Arg0Type = Arg0->getType();
3784 Type *Arg1Type = Arg1->getType();
3785
3786 Value *LHSAlloca =
3787 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3788 Value *RHSAlloca =
3789 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3790 Value *LHSAddrCast =
3791 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3792 Value *RHSAddrCast =
3793 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3794 Builder.CreateStore(Arg0, LHSAddrCast);
3795 Builder.CreateStore(Arg1, RHSAddrCast);
3796 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3797 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3798 } else {
3799 LHSArrayPtr = ReductionFunc->getArg(0);
3800 RHSArrayPtr = ReductionFunc->getArg(1);
3801 }
3802
3803 unsigned NumReductions = ReductionInfos.size();
3804 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3805
3806 for (auto En : enumerate(ReductionInfos)) {
3807 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3808 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3809 RedArrayTy, LHSArrayPtr, 0, En.index());
3810 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3811     Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3812 LHSI8Ptr, RI.Variable->getType());
3813 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3814 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3815 RedArrayTy, RHSArrayPtr, 0, En.index());
3816 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3817     Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3818 RHSI8Ptr, RI.PrivateVariable->getType());
3819 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3820 Value *Reduced;
3821     OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3822 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3823 if (!AfterIP)
3824 return AfterIP.takeError();
3825
3826 Builder.restoreIP(*AfterIP);
3827 // TODO: Consider flagging an error.
3828 if (!Builder.GetInsertBlock())
3829 return Error::success();
3830
3831 // store is inside of the reduction region when using by-ref
3832 if (!IsByRef[En.index()])
3833 Builder.CreateStore(Reduced, LHSPtr);
3834 }
3835 Builder.CreateRetVoid();
3836 return Error::success();
3837}
3838
3839 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3840 const LocationDescription &Loc, InsertPointTy AllocaIP,
3841 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3842 bool IsNoWait, bool IsTeamsReduction) {
3843 assert(ReductionInfos.size() == IsByRef.size());
3844 if (Config.isGPU())
3845 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3846 IsNoWait, IsTeamsReduction);
3847
3848 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3849
3850 if (!updateToLocation(Loc))
3851 return InsertPointTy();
3852
3853 if (ReductionInfos.size() == 0)
3854 return Builder.saveIP();
3855
3856 BasicBlock *InsertBlock = Loc.IP.getBlock();
3857 BasicBlock *ContinuationBlock =
3858 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3859 InsertBlock->getTerminator()->eraseFromParent();
3860
3861 // Create and populate array of type-erased pointers to private reduction
3862 // values.
3863 unsigned NumReductions = ReductionInfos.size();
3864 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3865   Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3866 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3867
3868 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3869
3870 for (auto En : enumerate(ReductionInfos)) {
3871 unsigned Index = En.index();
3872 const ReductionInfo &RI = En.value();
3873 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3874 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3875 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3876 }
3877
3878 // Emit a call to the runtime function that orchestrates the reduction.
3879 // Declare the reduction function in the process.
3880 Type *IndexTy = Builder.getIndexTy(
3883 Module *Module = Func->getParent();
3884 uint32_t SrcLocStrSize;
3885 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3886 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3887 return RI.AtomicReductionGen;
3888 });
3889 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3890 CanGenerateAtomic
3891 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3892 : IdentFlag(0));
3893 Value *ThreadId = getOrCreateThreadID(Ident);
3894 Constant *NumVariables = Builder.getInt32(NumReductions);
3895 const DataLayout &DL = Module->getDataLayout();
3896 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3897 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3898 Function *ReductionFunc = getFreshReductionFunc(*Module);
3899 Value *Lock = getOMPCriticalRegionLock(".reduction");
3900   Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3901 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3902 : RuntimeFunction::OMPRTL___kmpc_reduce);
3903 CallInst *ReduceCall =
3904 Builder.CreateCall(ReduceFunc,
3905 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3906 ReductionFunc, Lock},
3907 "reduce");
3908
3909 // Create final reduction entry blocks for the atomic and non-atomic case.
3910 // Emit IR that dispatches control flow to one of the blocks based on the
3911 // reduction supporting the atomic mode.
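  // The call's result selects the finalization strategy: 1 dispatches to the
  // non-atomic (elementwise) path, 2 to the atomic path, and any other value
  // (typically 0) falls through to the continuation block with nothing to do.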
3912 BasicBlock *NonAtomicRedBlock =
3913 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3914 BasicBlock *AtomicRedBlock =
3915 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3916 SwitchInst *Switch =
3917 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3918 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3919 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3920
3921 // Populate the non-atomic reduction using the elementwise reduction function.
3922 // This loads the elements from the global and private variables and reduces
3923 // them before storing back the result to the global variable.
3924 Builder.SetInsertPoint(NonAtomicRedBlock);
3925 for (auto En : enumerate(ReductionInfos)) {
3926 const ReductionInfo &RI = En.value();
3927     Type *ValueType = RI.ElementType;
3928 // We have one less load for by-ref case because that load is now inside of
3929 // the reduction region
3930 Value *RedValue = RI.Variable;
3931 if (!IsByRef[En.index()]) {
3932 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3933 "red.value." + Twine(En.index()));
3934 }
3935 Value *PrivateRedValue =
3936         Builder.CreateLoad(ValueType, RI.PrivateVariable,
3937 "red.private.value." + Twine(En.index()));
3938 Value *Reduced;
3939 InsertPointOrErrorTy AfterIP =
3940 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3941 if (!AfterIP)
3942 return AfterIP.takeError();
3943 Builder.restoreIP(*AfterIP);
3944
3945 if (!Builder.GetInsertBlock())
3946 return InsertPointTy();
3947 // for by-ref case, the load is inside of the reduction region
3948 if (!IsByRef[En.index()])
3949 Builder.CreateStore(Reduced, RI.Variable);
3950 }
3951 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3952 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3953 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3954 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3955 Builder.CreateBr(ContinuationBlock);
3956
3957 // Populate the atomic reduction using the atomic elementwise reduction
3958 // function. There are no loads/stores here because they will be happening
3959 // inside the atomic elementwise reduction.
3960 Builder.SetInsertPoint(AtomicRedBlock);
3961 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3962 for (const ReductionInfo &RI : ReductionInfos) {
3963       InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3964           Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3965 if (!AfterIP)
3966 return AfterIP.takeError();
3967 Builder.restoreIP(*AfterIP);
3968 if (!Builder.GetInsertBlock())
3969 return InsertPointTy();
3970 }
3971 Builder.CreateBr(ContinuationBlock);
3972 } else {
3973     Builder.CreateUnreachable();
3974 }
3975
3976 // Populate the outlined reduction function using the elementwise reduction
3977 // function. Partial values are extracted from the type-erased array of
3978 // pointers to private variables.
3979 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3980 IsByRef, /*isGPU=*/false);
3981 if (Err)
3982 return Err;
3983
3984 if (!Builder.GetInsertBlock())
3985 return InsertPointTy();
3986
3987 Builder.SetInsertPoint(ContinuationBlock);
3988 return Builder.saveIP();
3989}
3990
3991 OpenMPIRBuilder::InsertPointOrErrorTy
3992 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3993 BodyGenCallbackTy BodyGenCB,
3994 FinalizeCallbackTy FiniCB) {
3995 if (!updateToLocation(Loc))
3996 return Loc.IP;
3997
3998 Directive OMPD = Directive::OMPD_master;
3999 uint32_t SrcLocStrSize;
4000 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4001 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4002 Value *ThreadId = getOrCreateThreadID(Ident);
4003 Value *Args[] = {Ident, ThreadId};
4004
4005 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4006 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4007
4008 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4009 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4010
4011 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4012 /*Conditional*/ true, /*hasFinalize*/ true);
4013}
4014
4015 OpenMPIRBuilder::InsertPointOrErrorTy
4016 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4017 BodyGenCallbackTy BodyGenCB,
4018 FinalizeCallbackTy FiniCB, Value *Filter) {
4019 if (!updateToLocation(Loc))
4020 return Loc.IP;
4021
4022 Directive OMPD = Directive::OMPD_masked;
4023 uint32_t SrcLocStrSize;
4024 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4025 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4026 Value *ThreadId = getOrCreateThreadID(Ident);
4027 Value *Args[] = {Ident, ThreadId, Filter};
4028 Value *ArgsEnd[] = {Ident, ThreadId};
4029
4030 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4031 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4032
4033 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4034 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4035
4036 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4037 /*Conditional*/ true, /*hasFinalize*/ true);
4038}
4039
4040 static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
4041 llvm::FunctionCallee Callee,
4042                                                ArrayRef<llvm::Value *> Args,
4043 const llvm::Twine &Name) {
4044 llvm::CallInst *Call = Builder.CreateCall(
4045       Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4046 Call->setDoesNotThrow();
4047 return Call;
4048}
4049
4050// Expects input basic block is dominated by BeforeScanBB.
4051// Once Scan directive is encountered, the code after scan directive should be
4052// dominated by AfterScanBB. Scan directive splits the code sequence to
4053// scan and input phase. Based on whether inclusive or exclusive
4054// clause is used in the scan directive and whether input loop or scan loop
4055// is lowered, it adds jumps to input and scan phase. First Scan loop is the
4056// input loop and second is the scan loop. The code generated handles only
4057// inclusive scans now.
4058 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4059     const LocationDescription &Loc, InsertPointTy AllocaIP,
4060 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4061 bool IsInclusive, ScanInfo *ScanRedInfo) {
4062 if (ScanRedInfo->OMPFirstScanLoop) {
4063 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4064 ScanVarsType, ScanRedInfo);
4065 if (Err)
4066 return Err;
4067 }
4068 if (!updateToLocation(Loc))
4069 return Loc.IP;
4070
4071 llvm::Value *IV = ScanRedInfo->IV;
4072
4073 if (ScanRedInfo->OMPFirstScanLoop) {
4074 // Emit buffer[i] = red; at the end of the input phase.
4075 for (size_t i = 0; i < ScanVars.size(); i++) {
4076 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4077 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4078 Type *DestTy = ScanVarsType[i];
4079 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4080 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4081
4082 Builder.CreateStore(Src, Val);
4083 }
4084 }
4085 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4086 emitBlock(ScanRedInfo->OMPScanDispatch,
4087             Builder.GetInsertBlock()->getParent());
4088
4089 if (!ScanRedInfo->OMPFirstScanLoop) {
4090 IV = ScanRedInfo->IV;
4091 // Emit red = buffer[i]; at the entrance to the scan phase.
4092 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4093 for (size_t i = 0; i < ScanVars.size(); i++) {
4094 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4095 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4096 Type *DestTy = ScanVarsType[i];
4097 Value *SrcPtr =
4098 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4099 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4100 Builder.CreateStore(Src, ScanVars[i]);
4101 }
4102 }
4103
4104 // TODO: Update it to CreateBr and remove dead blocks
4105 llvm::Value *CmpI = Builder.getInt1(true);
4106 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4107 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4108 ScanRedInfo->OMPAfterScanBlock);
4109 } else {
4110 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4111 ScanRedInfo->OMPBeforeScanBlock);
4112 }
4113 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4114             Builder.GetInsertBlock()->getParent());
4115
4116 return Builder.saveIP();
4117}
4118
4119Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4120 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4121 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4122
4123 Builder.restoreIP(AllocaIP);
4124 // Create the shared pointer at alloca IP.
4125 for (size_t i = 0; i < ScanVars.size(); i++) {
4126 llvm::Value *BuffPtr =
4127 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4128 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4129 }
4130
4131 // Allocate temporary buffer by master thread
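   // Note that the buffer spans Span + 1 elements per scan variable (AllocSpan
   // below), leaving one slot past the last iteration that later receives the
   // final reduced value in emitScanBasedDirectiveFinalsIR.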
4132 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4133 InsertPointTy CodeGenIP) -> Error {
4134 Builder.restoreIP(CodeGenIP);
4135 Value *AllocSpan =
4136 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4137 for (size_t i = 0; i < ScanVars.size(); i++) {
4138 Type *IntPtrTy = Builder.getInt32Ty();
4139 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4140 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4141 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4142 AllocSpan, nullptr, "arr");
4143 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4144 }
4145 return Error::success();
4146 };
4147 // TODO: Perform finalization actions for variables. This has to be
4148 // called for variables which have destructors/finalizers.
4149 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4150
4152 llvm::Value *FilterVal = Builder.getInt32(0);
4153   llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4154 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4155
4156 if (!AfterIP)
4157 return AfterIP.takeError();
4158 Builder.restoreIP(*AfterIP);
4159 BasicBlock *InputBB = Builder.GetInsertBlock();
4160 if (InputBB->getTerminator())
4161     Builder.SetInsertPoint(InputBB->getTerminator());
4162 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4163 if (!AfterIP)
4164 return AfterIP.takeError();
4165 Builder.restoreIP(*AfterIP);
4166
4167 return Error::success();
4168}
4169
4170Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4171 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4172 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4173 InsertPointTy CodeGenIP) -> Error {
4174 Builder.restoreIP(CodeGenIP);
4175 for (ReductionInfo RedInfo : ReductionInfos) {
4176 Value *PrivateVar = RedInfo.PrivateVariable;
4177 Value *OrigVar = RedInfo.Variable;
4178 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4179 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4180
4181 Type *SrcTy = RedInfo.ElementType;
4182 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4183 "arrayOffset");
4184 Value *Src = Builder.CreateLoad(SrcTy, Val);
4185
4186 Builder.CreateStore(Src, OrigVar);
4187 Builder.CreateFree(Buff);
4188 }
4189 return Error::success();
4190 };
4191 // TODO: Perform finalization actions for variables. This has to be
4192 // called for variables which have destructors/finalizers.
4193 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4194
4195 if (ScanRedInfo->OMPScanFinish->getTerminator())
4196     Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4197 else
4198 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4199
4200 llvm::Value *FilterVal = Builder.getInt32(0);
4201   llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4202 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4203
4204 if (!AfterIP)
4205 return AfterIP.takeError();
4206 Builder.restoreIP(*AfterIP);
4207 BasicBlock *InputBB = Builder.GetInsertBlock();
4208 if (InputBB->getTerminator())
4209     Builder.SetInsertPoint(InputBB->getTerminator());
4210 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4211 if (!AfterIP)
4212 return AfterIP.takeError();
4213 Builder.restoreIP(*AfterIP);
4214 return Error::success();
4215}
4216
4217 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4218 const LocationDescription &Loc,
4219     SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> &ReductionInfos,
4220 ScanInfo *ScanRedInfo) {
4221
4222 if (!updateToLocation(Loc))
4223 return Loc.IP;
4224 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4225 InsertPointTy CodeGenIP) -> Error {
4226 Builder.restoreIP(CodeGenIP);
4227     Function *CurFn = Builder.GetInsertBlock()->getParent();
4228 // for (int k = 0; k <= ceil(log2(n)); ++k)
4229 llvm::BasicBlock *LoopBB =
4230 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4231 llvm::BasicBlock *ExitBB =
4232 splitBB(Builder, false, "omp.outer.log.scan.exit");
4233     llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4234         Builder.GetInsertBlock()->getModule(),
4235 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4237 llvm::Value *Arg =
4238 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4239 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4240     F = llvm::Intrinsic::getOrInsertDeclaration(
4241         Builder.GetInsertBlock()->getModule(),
4242 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4243 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4244 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4245     llvm::Value *NMin1 = Builder.CreateNUWSub(
4246 ScanRedInfo->Span,
4247 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4248 Builder.SetInsertPoint(InputBB);
4249 Builder.CreateBr(LoopBB);
4250 emitBlock(LoopBB, CurFn);
4251 Builder.SetInsertPoint(LoopBB);
4252
4253 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4254 // size pow2k = 1;
4255     PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4256 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4257 InputBB);
4258 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4259 InputBB);
4260 // for (size i = n - 1; i >= 2 ^ k; --i)
4261 // tmp[i] op= tmp[i-pow2k];
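    // Worked example for sums with n = 4 and buffer [a, b, c, d]:
    //   k = 0 (pow2k = 1): [a, a+b, b+c, c+d]
    //   k = 1 (pow2k = 2): [a, a+b, a+b+c, a+b+c+d]
    // so after ceil(log2(n)) passes each slot holds its inclusive prefix.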
4262 llvm::BasicBlock *InnerLoopBB =
4263 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4264 llvm::BasicBlock *InnerExitBB =
4265 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4266 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4267 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4268 emitBlock(InnerLoopBB, CurFn);
4269 Builder.SetInsertPoint(InnerLoopBB);
4270     PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4271 IVal->addIncoming(NMin1, LoopBB);
4272 for (ReductionInfo RedInfo : ReductionInfos) {
4273 Value *ReductionVal = RedInfo.PrivateVariable;
4274 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4275 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4276 Type *DestTy = RedInfo.ElementType;
4277       Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4278 Value *LHSPtr =
4279 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4280 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4281 Value *RHSPtr =
4282 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4283 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4284 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4285 llvm::Value *Result;
4286 InsertPointOrErrorTy AfterIP =
4287 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4288 if (!AfterIP)
4289 return AfterIP.takeError();
4290 Builder.CreateStore(Result, LHSPtr);
4291 }
4292 llvm::Value *NextIVal = Builder.CreateNUWSub(
4293 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4294 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4295 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4296 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4297 emitBlock(InnerExitBB, CurFn);
4298     llvm::Value *Next = Builder.CreateNUWAdd(
4299 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4300 Counter->addIncoming(Next, Builder.GetInsertBlock());
4301 // pow2k <<= 1;
4302 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4303 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4304 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4305 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4306     emitBlock(ExitBB, CurFn);
4307 return Error::success();
4308 };
4309
4310 // TODO: Perform finalization actions for variables. This has to be
4311 // called for variables which have destructors/finalizers.
4312 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4313
4314 llvm::Value *FilterVal = Builder.getInt32(0);
4315   llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4316 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4317
4318 if (!AfterIP)
4319 return AfterIP.takeError();
4320 Builder.restoreIP(*AfterIP);
4321 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4322
4323 if (!AfterIP)
4324 return AfterIP.takeError();
4325 Builder.restoreIP(*AfterIP);
4326 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4327 if (Err)
4328 return Err;
4329
4330 return AfterIP;
4331}
4332
4333Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4334 llvm::function_ref<Error()> InputLoopGen,
4335 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4336 ScanInfo *ScanRedInfo) {
4337
4338 {
4339 // Emit loop with input phase:
4340 // for (i: 0..<num_iters>) {
4341 // <input phase>;
4342 // buffer[i] = red;
4343 // }
4344 ScanRedInfo->OMPFirstScanLoop = true;
4345 Error Err = InputLoopGen();
4346 if (Err)
4347 return Err;
4348 }
4349 {
4350 // Emit loop with scan phase:
4351 // for (i: 0..<num_iters>) {
4352 // red = buffer[i];
4353 // <scan phase>;
4354 // }
4355 ScanRedInfo->OMPFirstScanLoop = false;
4356 Error Err = ScanLoopGen(Builder.saveIP());
4357 if (Err)
4358 return Err;
4359 }
4360 return Error::success();
4361}
4362
4363void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4364   Function *Fun = Builder.GetInsertBlock()->getParent();
4365 ScanRedInfo->OMPScanDispatch =
4366 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4367 ScanRedInfo->OMPAfterScanBlock =
4368 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4369 ScanRedInfo->OMPBeforeScanBlock =
4370 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4371 ScanRedInfo->OMPScanLoopExit =
4372 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4373}
4374 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4375 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4376 BasicBlock *PostInsertBefore, const Twine &Name) {
4377 Module *M = F->getParent();
4378 LLVMContext &Ctx = M->getContext();
4379 Type *IndVarTy = TripCount->getType();
4380
4381 // Create the basic block structure.
4382 BasicBlock *Preheader =
4383 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4384 BasicBlock *Header =
4385 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4386 BasicBlock *Cond =
4387 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4388 BasicBlock *Body =
4389 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4390 BasicBlock *Latch =
4391 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4392 BasicBlock *Exit =
4393 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4394 BasicBlock *After =
4395 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4396
4397 // Use specified DebugLoc for new instructions.
4398   Builder.SetCurrentDebugLocation(DL);
4399
4400 Builder.SetInsertPoint(Preheader);
4401 Builder.CreateBr(Header);
4402
4403 Builder.SetInsertPoint(Header);
4404 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4405 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4406   Builder.CreateBr(Cond);
4407
4408   Builder.SetInsertPoint(Cond);
4409 Value *Cmp =
4410 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4411 Builder.CreateCondBr(Cmp, Body, Exit);
4412
4413 Builder.SetInsertPoint(Body);
4414 Builder.CreateBr(Latch);
4415
4416 Builder.SetInsertPoint(Latch);
4417 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4418 "omp_" + Name + ".next", /*HasNUW=*/true);
4419 Builder.CreateBr(Header);
4420 IndVarPHI->addIncoming(Next, Latch);
4421
4422 Builder.SetInsertPoint(Exit);
4423 Builder.CreateBr(After);
4424
4425 // Remember and return the canonical control flow.
4426 LoopInfos.emplace_front();
4427 CanonicalLoopInfo *CL = &LoopInfos.front();
4428
4429 CL->Header = Header;
4430 CL->Cond = Cond;
4431 CL->Latch = Latch;
4432 CL->Exit = Exit;
4433
4434#ifndef NDEBUG
4435 CL->assertOK();
4436#endif
4437 return CL;
4438}
4439
4440 Expected<CanonicalLoopInfo *>
4441 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4442 LoopBodyGenCallbackTy BodyGenCB,
4443 Value *TripCount, const Twine &Name) {
4444 BasicBlock *BB = Loc.IP.getBlock();
4445 BasicBlock *NextBB = BB->getNextNode();
4446
4447 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4448 NextBB, NextBB, Name);
4449 BasicBlock *After = CL->getAfter();
4450
4451 // If location is not set, don't connect the loop.
4452 if (updateToLocation(Loc)) {
4453 // Split the loop at the insertion point: Branch to the preheader and move
4454 // every following instruction to after the loop (the After BB). Also, the
4455 // new successor is the loop's after block.
4456 spliceBB(Builder, After, /*CreateBranch=*/false);
4457     Builder.CreateBr(CL->getPreheader());
4458 }
4459
4460 // Emit the body content. We do it after connecting the loop to the CFG to
4461 // avoid that the callback encounters degenerate BBs.
4462 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4463 return Err;
4464
4465#ifndef NDEBUG
4466 CL->assertOK();
4467#endif
4468 return CL;
4469}
4470
4471 Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4472 ScanInfos.emplace_front();
4473 ScanInfo *Result = &ScanInfos.front();
4474 return Result;
4475}
4476
4477 Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4478 OpenMPIRBuilder::createCanonicalScanLoops(
4479 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4480 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4481 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4482 LocationDescription ComputeLoc =
4483 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4484 updateToLocation(ComputeLoc);
4485
4486   SmallVector<llvm::CanonicalLoopInfo *> Result;
4487
4488   Value *TripCount = calculateCanonicalLoopTripCount(
4489 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4490 ScanRedInfo->Span = TripCount;
4491 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4492 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4493
4494 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4495 Builder.restoreIP(CodeGenIP);
4496 ScanRedInfo->IV = IV;
4497 createScanBBs(ScanRedInfo);
4498 BasicBlock *InputBlock = Builder.GetInsertBlock();
4499 Instruction *Terminator = InputBlock->getTerminator();
4500 assert(Terminator->getNumSuccessors() == 1);
4501 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4502 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4503 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4504               Builder.GetInsertBlock()->getParent());
4505 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4506 emitBlock(ScanRedInfo->OMPScanLoopExit,
4507               Builder.GetInsertBlock()->getParent());
4508 Builder.CreateBr(ContinueBlock);
4509     Builder.SetInsertPoint(ScanRedInfo->OMPBeforeScanBlock,
4510 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4511 return BodyGenCB(Builder.saveIP(), IV);
4512 };
4513
4514 const auto &&InputLoopGen = [&]() -> Error {
4515     Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4516 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4517 ComputeIP, Name, true, ScanRedInfo);
4518 if (!LoopInfo)
4519 return LoopInfo.takeError();
4520 Result.push_back(*LoopInfo);
4521 Builder.restoreIP((*LoopInfo)->getAfterIP());
4522 return Error::success();
4523 };
4524 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4525     Expected<CanonicalLoopInfo *> LoopInfo =
4526 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4527 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4528 if (!LoopInfo)
4529 return LoopInfo.takeError();
4530 Result.push_back(*LoopInfo);
4531 Builder.restoreIP((*LoopInfo)->getAfterIP());
4532 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4533 return Error::success();
4534 };
4535 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4536 if (Err)
4537 return Err;
4538 return Result;
4539}
4540
4541 Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4542 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4543 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4544
4545 // Consider the following difficulties (assuming 8-bit signed integers):
4546 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4547 // DO I = 1, 100, 50
4548   // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4549 // DO I = 100, 0, -128
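  // For instance, for DO I = 1, 100, 50 only I = 1 and I = 51 execute, yet a
  // naive "add then compare" scheme would keep adding 50 and eventually wrap
  // around the 8-bit range; and -(-128) is not representable in i8 at all.
  // The unsigned arithmetic below avoids both pitfalls.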
4550
4551 // Start, Stop and Step must be of the same integer type.
4552 auto *IndVarTy = cast<IntegerType>(Start->getType());
4553 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4554 assert(IndVarTy == Step->getType() && "Step type mismatch");
4555
4556 updateToLocation(Loc);
4557
4558 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4559 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4560
4561 // Like Step, but always positive.
4562 Value *Incr = Step;
4563
4564 // Distance between Start and Stop; always positive.
4565 Value *Span;
4566
4567   // Condition for whether no iterations are executed at all, e.g. because
4568 // UB < LB.
4569 Value *ZeroCmp;
4570
4571 if (IsSigned) {
4572 // Ensure that increment is positive. If not, negate and invert LB and UB.
4573 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4574 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4575 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4576 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4577 Span = Builder.CreateSub(UB, LB, "", false, true);
4578 ZeroCmp = Builder.CreateICmp(
4579 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4580 } else {
4581 Span = Builder.CreateSub(Stop, Start, "", true);
4582 ZeroCmp = Builder.CreateICmp(
4583 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4584 }
4585
4586 Value *CountIfLooping;
4587 if (InclusiveStop) {
4588 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4589 } else {
4590 // Avoid incrementing past stop since it could overflow.
4591 Value *CountIfTwo = Builder.CreateAdd(
4592 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4593 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4594 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4595 }
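  // Numeric sketch: Start = 0, Stop = 10, Step = 3 with an exclusive stop
  // yields Span = 10 and CountIfTwo = (10 - 1) / 3 + 1 = 4 (I = 0, 3, 6, 9);
  // since Span > Incr, the select keeps CountIfTwo.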
4596
4597 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4598 "omp_" + Name + ".tripcount");
4599}
4600
4601 Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4602 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4603 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4604 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4605 ScanInfo *ScanRedInfo) {
4606 LocationDescription ComputeLoc =
4607 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4608
4609   Value *TripCount = calculateCanonicalLoopTripCount(
4610 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4611
4612 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4613 Builder.restoreIP(CodeGenIP);
4614 Value *Span = Builder.CreateMul(IV, Step);
4615 Value *IndVar = Builder.CreateAdd(Span, Start);
4616 if (InScan)
4617 ScanRedInfo->IV = IndVar;
4618 return BodyGenCB(Builder.saveIP(), IndVar);
4619 };
4620 LocationDescription LoopLoc =
4621 ComputeIP.isSet()
4622 ? Loc
4623           : LocationDescription(Builder.saveIP(),
4624                                 Builder.getCurrentDebugLocation());
4625 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4626}
4627
4628// Returns an LLVM function to call for initializing loop bounds using OpenMP
4629// static scheduling for composite `distribute parallel for` depending on
4630// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4631// integers as unsigned similarly to CanonicalLoopInfo.
4632static FunctionCallee
4633 getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4634 OpenMPIRBuilder &OMPBuilder) {
4635 unsigned Bitwidth = Ty->getIntegerBitWidth();
4636 if (Bitwidth == 32)
4637 return OMPBuilder.getOrCreateRuntimeFunction(
4638 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4639 if (Bitwidth == 64)
4640 return OMPBuilder.getOrCreateRuntimeFunction(
4641 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4642 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4643}
4644
4645// Returns an LLVM function to call for initializing loop bounds using OpenMP
4646// static scheduling depending on `type`. Only i32 and i64 are supported by the
4647// runtime. Always interpret integers as unsigned similarly to
4648// CanonicalLoopInfo.
4649 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4650 OpenMPIRBuilder &OMPBuilder) {
4651 unsigned Bitwidth = Ty->getIntegerBitWidth();
4652 if (Bitwidth == 32)
4653 return OMPBuilder.getOrCreateRuntimeFunction(
4654 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4655 if (Bitwidth == 64)
4656 return OMPBuilder.getOrCreateRuntimeFunction(
4657 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4658 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4659}
4660
4661OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4662 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4663 WorksharingLoopType LoopType, bool NeedsBarrier) {
4664 assert(CLI->isValid() && "Requires a valid canonical loop");
4665 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4666 "Require dedicated allocate IP");
4667
4668 // Set up the source location value for OpenMP runtime.
4669 Builder.restoreIP(CLI->getPreheaderIP());
4670 Builder.SetCurrentDebugLocation(DL);
4671
4672 uint32_t SrcLocStrSize;
4673 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4674 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4675
4676 // Declare useful OpenMP runtime functions.
4677 Value *IV = CLI->getIndVar();
4678 Type *IVTy = IV->getType();
4679 FunctionCallee StaticInit =
4680 LoopType == WorksharingLoopType::DistributeForStaticLoop
4681 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4682 : getKmpcForStaticInitForType(IVTy, M, *this);
4683 FunctionCallee StaticFini =
4684 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4685
4686 // Allocate space for computed loop bounds as expected by the "init" function.
4687 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4688
4689 Type *I32Type = Type::getInt32Ty(M.getContext());
4690 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4691 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4692 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4693 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4694 CLI->setLastIter(PLastIter);
4695
4696 // At the end of the preheader, prepare for calling the "init" function by
4697 // storing the current loop bounds into the allocated space. A canonical loop
4698 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4699 // and produces an inclusive upper bound.
4700 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4701 Constant *Zero = ConstantInt::get(IVTy, 0);
4702 Constant *One = ConstantInt::get(IVTy, 1);
4703 Builder.CreateStore(Zero, PLowerBound);
4704 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4705 Builder.CreateStore(UpperBound, PUpperBound);
4706 Builder.CreateStore(One, PStride);
4707
4708 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4709
4710 OMPScheduleType SchedType =
4711 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4712 ? OMPScheduleType::OrderedDistribute
4713 : OMPScheduleType::UnorderedStatic;
4714 Constant *SchedulingType =
4715 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4716
4717 // Call the "init" function and update the trip count of the loop with the
4718 // value it produced.
4719 SmallVector<Value *, 10> Args(
4720 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4721 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4722 Value *PDistUpperBound =
4723 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4724 Args.push_back(PDistUpperBound);
4725 }
4726 Args.append({PStride, One, Zero});
4727 Builder.CreateCall(StaticInit, Args);
4728 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4729 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4730 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4731 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4732 CLI->setTripCount(TripCount);
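 // As an illustrative example (values chosen here, not computed by the
 // builder): with an original trip count of 100 and 4 threads, the runtime may
 // hand thread 1 the inclusive bounds [25, 49]; the rebased trip count then
 // becomes 49 - 25 + 1 = 25 and the mapping below adds 25 to every canonical
 // IV value of that thread.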
4733
4734 // Update all uses of the induction variable except the one in the condition
4735 // block that compares it with the actual upper bound, and the increment in
4736 // the latch block.
4737
4738 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4739 Builder.SetInsertPoint(CLI->getBody(),
4740 CLI->getBody()->getFirstInsertionPt());
4741 Builder.SetCurrentDebugLocation(DL);
4742 return Builder.CreateAdd(OldIV, LowerBound);
4743 });
4744
4745 // In the "exit" block, call the "fini" function.
4746 Builder.SetInsertPoint(CLI->getExit(),
4747 CLI->getExit()->getTerminator()->getIterator());
4748 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4749
4750 // Add the barrier if requested.
4751 if (NeedsBarrier) {
4752 InsertPointOrErrorTy BarrierIP =
4753 createBarrier(LocationDescription(Builder.saveIP(), DL),
4754 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4755 /* CheckCancelFlag */ false);
4756 if (!BarrierIP)
4757 return BarrierIP.takeError();
4758 }
4759
4760 InsertPointTy AfterIP = CLI->getAfterIP();
4761 CLI->invalidate();
4762
4763 return AfterIP;
4764}
4765
4766 OpenMPIRBuilder::InsertPointOrErrorTy
4767 OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4768 CanonicalLoopInfo *CLI,
4769 InsertPointTy AllocaIP,
4770 bool NeedsBarrier,
4771 Value *ChunkSize) {
4772 assert(CLI->isValid() && "Requires a valid canonical loop");
4773 assert(ChunkSize && "Chunk size is required");
4774
4775 LLVMContext &Ctx = CLI->getFunction()->getContext();
4776 Value *IV = CLI->getIndVar();
4777 Value *OrigTripCount = CLI->getTripCount();
4778 Type *IVTy = IV->getType();
4779 assert(IVTy->getIntegerBitWidth() <= 64 &&
4780 "Max supported tripcount bitwidth is 64 bits");
4781 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4782 : Type::getInt64Ty(Ctx);
4783 Type *I32Type = Type::getInt32Ty(M.getContext());
4784 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4785 Constant *One = ConstantInt::get(InternalIVTy, 1);
4786
4787 // Declare useful OpenMP runtime functions.
4788 FunctionCallee StaticInit =
4789 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4790 FunctionCallee StaticFini =
4791 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4792
4793 // Allocate space for computed loop bounds as expected by the "init" function.
4794 Builder.restoreIP(AllocaIP);
4795 Builder.SetCurrentDebugLocation(DL);
4796 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4797 Value *PLowerBound =
4798 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4799 Value *PUpperBound =
4800 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4801 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4802 CLI->setLastIter(PLastIter);
4803
4804 // Set up the source location value for the OpenMP runtime.
4805 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4806 Builder.SetCurrentDebugLocation(DL);
4807
4808 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4809 Value *CastedChunkSize =
4810 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4811 Value *CastedTripCount =
4812 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4813
4814 Constant *SchedulingType = ConstantInt::get(
4815 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4816 Builder.CreateStore(Zero, PLowerBound);
4817 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4818 Builder.CreateStore(OrigUpperBound, PUpperBound);
4819 Builder.CreateStore(One, PStride);
4820
4821 // Call the "init" function and update the trip count of the loop with the
4822 // value it produced.
4823 uint32_t SrcLocStrSize;
4824 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4825 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4826 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4827 Builder.CreateCall(StaticInit,
4828 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4829 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4830 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4831 /*pstride=*/PStride, /*incr=*/One,
4832 /*chunk=*/CastedChunkSize});
4833
4834 // Load values written by the "init" function.
4835 Value *FirstChunkStart =
4836 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4837 Value *FirstChunkStop =
4838 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4839 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4840 Value *ChunkRange =
4841 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4842 Value *NextChunkStride =
4843 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
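 // Illustrative example (assumed values): with a trip count of 23, a chunk
 // size of 5 and 2 threads, thread 0 may get the first chunk [0, 4], so
 // ChunkRange = 5 and the returned stride is 10 (threads times chunk size);
 // the dispatch loop created below then enumerates the chunk starts 0, 10
 // and 20 for that thread.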
4844
4845 // Create outer "dispatch" loop for enumerating the chunks.
4846 BasicBlock *DispatchEnter = splitBB(Builder, true);
4847 Value *DispatchCounter;
4848
4849 // It is safe to assume this didn't return an error because the callback
4850 // passed into createCanonicalLoop is the only possible error source, and it
4851 // always returns success.
4852 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4853 {Builder.saveIP(), DL},
4854 [&](InsertPointTy BodyIP, Value *Counter) {
4855 DispatchCounter = Counter;
4856 return Error::success();
4857 },
4858 FirstChunkStart, CastedTripCount, NextChunkStride,
4859 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4860 "dispatch"));
4861
4862 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4863 // not have to preserve the canonical invariant.
4864 BasicBlock *DispatchBody = DispatchCLI->getBody();
4865 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4866 BasicBlock *DispatchExit = DispatchCLI->getExit();
4867 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4868 DispatchCLI->invalidate();
4869
4870 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4871 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4872 redirectTo(CLI->getExit(), DispatchLatch, DL);
4873 redirectTo(DispatchBody, DispatchEnter, DL);
4874
4875 // Prepare the prolog of the chunk loop.
4876 Builder.restoreIP(CLI->getPreheaderIP());
4877 Builder.SetCurrentDebugLocation(DL);
4878
4879 // Compute the number of iterations of the chunk loop.
4880 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4881 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4882 Value *IsLastChunk =
4883 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4884 Value *CountUntilOrigTripCount =
4885 Builder.CreateSub(CastedTripCount, DispatchCounter);
4886 Value *ChunkTripCount = Builder.CreateSelect(
4887 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4888 Value *BackcastedChunkTC =
4889 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4890 CLI->setTripCount(BackcastedChunkTC);
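 // Continuing the example above: the last chunk of thread 0 starts at 20 and
 // would end at 25 >= 23, so it is clamped to the remaining 23 - 20 = 3
 // iterations instead of the full chunk range of 5.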
4891
4892 // Update all uses of the induction variable except the one in the condition
4893 // block that compares it with the actual upper bound, and the increment in
4894 // the latch block.
4895 Value *BackcastedDispatchCounter =
4896 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4897 CLI->mapIndVar([&](Instruction *) -> Value * {
4898 Builder.restoreIP(CLI->getBodyIP());
4899 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4900 });
4901
4902 // In the "exit" block, call the "fini" function.
4903 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4904 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4905
4906 // Add the barrier if requested.
4907 if (NeedsBarrier) {
4908 InsertPointOrErrorTy AfterIP =
4909 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4910 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4911 if (!AfterIP)
4912 return AfterIP.takeError();
4913 }
4914
4915#ifndef NDEBUG
4916 // Even though we currently do not support applying additional methods to it,
4917 // the chunk loop should remain a canonical loop.
4918 CLI->assertOK();
4919#endif
4920
4921 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4922}
4923
4924// Returns an LLVM function to call for executing an OpenMP static worksharing
4925// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4926// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4927static FunctionCallee
4928 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4929 WorksharingLoopType LoopType) {
4930 unsigned Bitwidth = Ty->getIntegerBitWidth();
4931 Module &M = OMPBuilder->M;
4932 switch (LoopType) {
4933 case WorksharingLoopType::ForStaticLoop:
4934 if (Bitwidth == 32)
4935 return OMPBuilder->getOrCreateRuntimeFunction(
4936 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4937 if (Bitwidth == 64)
4938 return OMPBuilder->getOrCreateRuntimeFunction(
4939 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4940 break;
4941 case WorksharingLoopType::DistributeStaticLoop:
4942 if (Bitwidth == 32)
4943 return OMPBuilder->getOrCreateRuntimeFunction(
4944 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4945 if (Bitwidth == 64)
4946 return OMPBuilder->getOrCreateRuntimeFunction(
4947 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4948 break;
4949 case WorksharingLoopType::DistributeForStaticLoop:
4950 if (Bitwidth == 32)
4951 return OMPBuilder->getOrCreateRuntimeFunction(
4952 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4953 if (Bitwidth == 64)
4954 return OMPBuilder->getOrCreateRuntimeFunction(
4955 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4956 break;
4957 }
4958 if (Bitwidth != 32 && Bitwidth != 64) {
4959 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4960 }
4961 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4962}
4963
4964 // Inserts a call to the proper OpenMP device RTL function which handles
4965 // loop worksharing.
4966 static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4967 WorksharingLoopType LoopType,
4968 BasicBlock *InsertBlock, Value *Ident,
4969 Value *LoopBodyArg, Value *TripCount,
4970 Function &LoopBodyFn) {
4971 Type *TripCountTy = TripCount->getType();
4972 Module &M = OMPBuilder->M;
4973 IRBuilder<> &Builder = OMPBuilder->Builder;
4974 FunctionCallee RTLFn =
4975 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4976 SmallVector<Value *, 8> RealArgs;
4977 RealArgs.push_back(Ident);
4978 RealArgs.push_back(&LoopBodyFn);
4979 RealArgs.push_back(LoopBodyArg);
4980 RealArgs.push_back(TripCount);
4981 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4982 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4983 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4984 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4985 Builder.CreateCall(RTLFn, RealArgs);
4986 return;
4987 }
4988 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4989 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4990 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4991 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4992
4993 RealArgs.push_back(
4994 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4995 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4996 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4997 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4998 }
4999 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5000
5001 Builder.CreateCall(RTLFn, RealArgs);
5002}
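// For a plain worksharing loop the emitted call ends up with roughly this
// shape (a sketch; the authoritative signatures live in the device RTL):
//   %nt = call i32 @omp_get_num_threads()
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @outlined_body,
//                                        ptr %body_arg, i32 %tripcount,
//                                        i32 %nt, i32 0, i8 0)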
5003
5004 static void workshareLoopTargetCallback(
5005 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5006 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5007 WorksharingLoopType LoopType) {
5008 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5009 BasicBlock *Preheader = CLI->getPreheader();
5010 Value *TripCount = CLI->getTripCount();
5011
5012 // After loop body outlining, the loop body contains only the setup
5013 // of the loop body argument structure and the call to the outlined
5014 // loop body function. First, we need to move the setup of the loop body
5015 // args into the loop preheader.
5016 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5017 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5018
5019 // The next step is to remove the whole loop. We do not need it anymore.
5020 // That's why we make an unconditional branch from the loop preheader to the
5021 // loop exit block.
5022 Builder.restoreIP({Preheader, Preheader->end()});
5023 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5024 Preheader->getTerminator()->eraseFromParent();
5025 Builder.CreateBr(CLI->getExit());
5026
5027 // Delete dead loop blocks
5028 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5029 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5030 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5031 CleanUpInfo.EntryBB = CLI->getHeader();
5032 CleanUpInfo.ExitBB = CLI->getExit();
5033 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5034 DeleteDeadBlocks(BlocksToBeRemoved);
5035
5036 // Find the instruction which corresponds to loop body argument structure
5037 // and remove the call to loop body function instruction.
5038 Value *LoopBodyArg;
5039 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5040 assert(OutlinedFnUser &&
5041 "Expected unique undroppable user of outlined function");
5042 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5043 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5044 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5045 "Expected outlined function call to be located in loop preheader");
5046 // Check in case no argument structure has been passed.
5047 if (OutlinedFnCallInstruction->arg_size() > 1)
5048 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5049 else
5050 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5051 OutlinedFnCallInstruction->eraseFromParent();
5052
5053 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5054 LoopBodyArg, TripCount, OutlinedFn);
5055
5056 for (auto &ToBeDeletedItem : ToBeDeleted)
5057 ToBeDeletedItem->eraseFromParent();
5058 CLI->invalidate();
5059}
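// After this callback the original loop control flow is gone and the
// preheader roughly looks like (sketch):
//   omp_loop.preheader:
//     ... setup of the outlined function's argument structure ...
//     call void @__kmpc_for_static_loop_4u(...) ; runs the body tripcount times
//     br label %omp_loop.exit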
5060
5061 OpenMPIRBuilder::InsertPointOrErrorTy
5062 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
5063 InsertPointTy AllocaIP,
5064 WorksharingLoopType LoopType) {
5065 uint32_t SrcLocStrSize;
5066 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5067 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5068
5069 OutlineInfo OI;
5070 OI.OuterAllocaBB = CLI->getPreheader();
5071 Function *OuterFn = CLI->getPreheader()->getParent();
5072
5073 // Instructions which need to be deleted at the end of code generation
5074 SmallVector<Instruction *, 4> ToBeDeleted;
5075
5076 OI.OuterAllocaBB = AllocaIP.getBlock();
5077
5078 // Mark the body loop as region which needs to be extracted
5079 OI.EntryBB = CLI->getBody();
5080 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5081 "omp.prelatch", true);
5082
5083 // Prepare loop body for extraction
5084 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5085
5086 // Insert new loop counter variable which will be used only in loop
5087 // body.
5088 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5089 Instruction *NewLoopCntLoad =
5090 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5091 // New loop counter instructions are redundant in the loop preheader when
5092 // code generation for the workshare loop is finished. That's why we mark
5093 // them as ready for deletion.
5094 ToBeDeleted.push_back(NewLoopCntLoad);
5095 ToBeDeleted.push_back(NewLoopCnt);
5096
5097 // Analyse loop body region. Find all input variables which are used inside
5098 // loop body region.
5099 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5100 SmallVector<BasicBlock *, 32> Blocks;
5101 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5102
5103 CodeExtractorAnalysisCache CEAC(*OuterFn);
5104 CodeExtractor Extractor(Blocks,
5105 /* DominatorTree */ nullptr,
5106 /* AggregateArgs */ true,
5107 /* BlockFrequencyInfo */ nullptr,
5108 /* BranchProbabilityInfo */ nullptr,
5109 /* AssumptionCache */ nullptr,
5110 /* AllowVarArgs */ true,
5111 /* AllowAlloca */ true,
5112 /* AllocationBlock */ CLI->getPreheader(),
5113 /* Suffix */ ".omp_wsloop",
5114 /* AggrArgsIn0AddrSpace */ true);
5115
5116 BasicBlock *CommonExit = nullptr;
5117 SetVector<Value *> SinkingCands, HoistingCands;
5118
5119 // Find allocas outside the loop body region which are used inside loop
5120 // body
5121 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5122
5123 // We need to model the loop body region as the function f(cnt, loop_arg).
5124 // That's why we replace the loop induction variable with the new counter
5125 // which will be one of the loop body function's arguments.
5126 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5127 CLI->getIndVar()->user_end());
5128 for (auto Use : Users) {
5129 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5130 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5131 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5132 }
5133 }
5134 }
5135 // Make sure that the loop counter variable is not merged into the loop body
5136 // function's argument structure and that it is passed as a separate variable.
5137 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5138
5139 // The PostOutline CB is invoked when the loop body function is outlined and
5140 // the loop body is replaced by a call to the outlined function. We need to
5141 // add a call to the OpenMP device RTL inside the loop preheader. The OpenMP
5142 // device RTL function will handle the loop control logic.
5143 //
5144 OI.PostOutlineCB = [=, ToBeDeletedVec =
5145 std::move(ToBeDeleted)](Function &OutlinedFn) {
5146 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5147 LoopType);
5148 };
5149 addOutlineInfo(std::move(OI));
5150 return CLI->getAfterIP();
5151}
5152
5153 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5154 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5155 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5156 bool HasSimdModifier, bool HasMonotonicModifier,
5157 bool HasNonmonotonicModifier, bool HasOrderedClause,
5158 WorksharingLoopType LoopType) {
5159 if (Config.isTargetDevice())
5160 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
5161 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5162 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5163 HasNonmonotonicModifier, HasOrderedClause);
5164
5165 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5166 OMPScheduleType::ModifierOrdered;
5167 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5168 case OMPScheduleType::BaseStatic:
5169 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5170 if (IsOrdered)
5171 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5172 NeedsBarrier, ChunkSize);
5173 // FIXME: Monotonicity ignored?
5174 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5175
5176 case OMPScheduleType::BaseStaticChunked:
5177 if (IsOrdered)
5178 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5179 NeedsBarrier, ChunkSize);
5180 // FIXME: Monotonicity ignored?
5181 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5182 ChunkSize);
5183
5184 case OMPScheduleType::BaseRuntime:
5185 case OMPScheduleType::BaseAuto:
5186 case OMPScheduleType::BaseGreedy:
5187 case OMPScheduleType::BaseBalanced:
5188 case OMPScheduleType::BaseSteal:
5189 case OMPScheduleType::BaseGuidedSimd:
5190 case OMPScheduleType::BaseRuntimeSimd:
5191 assert(!ChunkSize &&
5192 "schedule type does not support user-defined chunk sizes");
5193 [[fallthrough]];
5194 case OMPScheduleType::BaseDynamicChunked:
5195 case OMPScheduleType::BaseGuidedChunked:
5196 case OMPScheduleType::BaseGuidedIterativeChunked:
5197 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5198 case OMPScheduleType::BaseStaticBalancedChunked:
5199 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5200 NeedsBarrier, ChunkSize);
5201
5202 default:
5203 llvm_unreachable("Unknown/unimplemented schedule kind");
5204 }
5205}
5206
5207/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5208/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5209/// the runtime. Always interpret integers as unsigned similarly to
5210/// CanonicalLoopInfo.
5211static FunctionCallee
5212 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5213 unsigned Bitwidth = Ty->getIntegerBitWidth();
5214 if (Bitwidth == 32)
5215 return OMPBuilder.getOrCreateRuntimeFunction(
5216 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5217 if (Bitwidth == 64)
5218 return OMPBuilder.getOrCreateRuntimeFunction(
5219 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5220 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5221}
5222
5223/// Returns an LLVM function to call for updating the next loop using OpenMP
5224/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5225/// the runtime. Always interpret integers as unsigned similarly to
5226/// CanonicalLoopInfo.
5227static FunctionCallee
5228 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5229 unsigned Bitwidth = Ty->getIntegerBitWidth();
5230 if (Bitwidth == 32)
5231 return OMPBuilder.getOrCreateRuntimeFunction(
5232 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5233 if (Bitwidth == 64)
5234 return OMPBuilder.getOrCreateRuntimeFunction(
5235 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5236 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5237}
5238
5239/// Returns an LLVM function to call for finalizing the dynamic loop using
5240/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5241/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5242static FunctionCallee
5243 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5244 unsigned Bitwidth = Ty->getIntegerBitWidth();
5245 if (Bitwidth == 32)
5246 return OMPBuilder.getOrCreateRuntimeFunction(
5247 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5248 if (Bitwidth == 64)
5249 return OMPBuilder.getOrCreateRuntimeFunction(
5250 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5251 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5252}
5253
5254 OpenMPIRBuilder::InsertPointOrErrorTy
5255 OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5256 InsertPointTy AllocaIP,
5257 OMPScheduleType SchedType,
5258 bool NeedsBarrier, Value *Chunk) {
5259 assert(CLI->isValid() && "Requires a valid canonical loop");
5260 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5261 "Require dedicated allocate IP");
5263 "Require valid schedule type");
5264
5265 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5266 OMPScheduleType::ModifierOrdered;
5267
5268 // Set up the source location value for OpenMP runtime.
5269 Builder.SetCurrentDebugLocation(DL);
5270
5271 uint32_t SrcLocStrSize;
5272 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5273 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5274
5275 // Declare useful OpenMP runtime functions.
5276 Value *IV = CLI->getIndVar();
5277 Type *IVTy = IV->getType();
5278 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5279 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5280
5281 // Allocate space for computed loop bounds as expected by the "init" function.
5282 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5283 Type *I32Type = Type::getInt32Ty(M.getContext());
5284 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5285 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5286 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5287 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5288 CLI->setLastIter(PLastIter);
5289
5290 // At the end of the preheader, prepare for calling the "init" function by
5291 // storing the current loop bounds into the allocated space. A canonical loop
5292 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5293 // and produces an inclusive upper bound.
5294 BasicBlock *PreHeader = CLI->getPreheader();
5295 Builder.SetInsertPoint(PreHeader->getTerminator());
5296 Constant *One = ConstantInt::get(IVTy, 1);
5297 Builder.CreateStore(One, PLowerBound);
5298 Value *UpperBound = CLI->getTripCount();
5299 Builder.CreateStore(UpperBound, PUpperBound);
5300 Builder.CreateStore(One, PStride);
5301
5302 BasicBlock *Header = CLI->getHeader();
5303 BasicBlock *Exit = CLI->getExit();
5304 BasicBlock *Cond = CLI->getCond();
5305 BasicBlock *Latch = CLI->getLatch();
5306 InsertPointTy AfterIP = CLI->getAfterIP();
5307
5308 // The CLI will be "broken" in the code below, as the loop is no longer
5309 // a valid canonical loop.
5310
5311 if (!Chunk)
5312 Chunk = One;
5313
5314 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5315
5316 Constant *SchedulingType =
5317 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5318
5319 // Call the "init" function.
5320 Builder.CreateCall(DynamicInit,
5321 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5322 UpperBound, /* step */ One, Chunk});
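 // Note the protocol difference to the static schedules: the dynamic "init"
 // receives the bounds by value (lb = 1, ub = tripcount, both inclusive), and
 // each __kmpc_dispatch_next call below hands out one chunk [lb, ub] at a
 // time, returning zero once no work is left. The canonical IV is later
 // rebased with lb - 1 because these runtime bounds are 1-based.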
5323
5324 // An outer loop around the existing one.
5325 BasicBlock *OuterCond = BasicBlock::Create(
5326 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5327 PreHeader->getParent());
5328 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5329 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5330 Value *Res =
5331 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5332 PLowerBound, PUpperBound, PStride});
5333 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5334 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5335 Value *LowerBound =
5336 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5337 Builder.CreateCondBr(MoreWork, Header, Exit);
5338
5339 // Change PHI-node in loop header to use outer cond rather than preheader,
5340 // and set IV to the LowerBound.
5341 Instruction *Phi = &Header->front();
5342 auto *PI = cast<PHINode>(Phi);
5343 PI->setIncomingBlock(0, OuterCond);
5344 PI->setIncomingValue(0, LowerBound);
5345
5346 // Then set the pre-header to jump to the OuterCond
5347 Instruction *Term = PreHeader->getTerminator();
5348 auto *Br = cast<BranchInst>(Term);
5349 Br->setSuccessor(0, OuterCond);
5350
5351 // Modify the inner condition:
5352 // * Use the UpperBound returned from the DynamicNext call.
5353 // * jump to the loop outer loop when done with one of the inner loops.
5354 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5355 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5356 Instruction *Comp = &*Builder.GetInsertPoint();
5357 auto *CI = cast<CmpInst>(Comp);
5358 CI->setOperand(1, UpperBound);
5359 // Redirect the inner exit to branch to outer condition.
5360 Instruction *Branch = &Cond->back();
5361 auto *BI = cast<BranchInst>(Branch);
5362 assert(BI->getSuccessor(1) == Exit);
5363 BI->setSuccessor(1, OuterCond);
5364
5365 // Call the "fini" function if "ordered" is present in wsloop directive.
5366 if (Ordered) {
5367 Builder.SetInsertPoint(&Latch->back());
5368 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5369 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5370 }
5371
5372 // Add the barrier if requested.
5373 if (NeedsBarrier) {
5374 Builder.SetInsertPoint(&Exit->back());
5375 InsertPointOrErrorTy BarrierIP =
5376 createBarrier(LocationDescription(Builder.saveIP(), DL),
5377 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5378 /* CheckCancelFlag */ false);
5379 if (!BarrierIP)
5380 return BarrierIP.takeError();
5381 }
5382
5383 CLI->invalidate();
5384 return AfterIP;
5385}
5386
5387/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5388/// after this \p OldTarget will be orphaned.
5389 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5390 BasicBlock *NewTarget, DebugLoc DL) {
5391 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5392 redirectTo(Pred, NewTarget, DL);
5393}
5394
5395/// Determine which blocks in \p BBs are reachable from outside and remove the
5396/// ones that are not reachable from the function.
5397 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5398 SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
5399 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5400 for (Use &U : BB->uses()) {
5401 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5402 if (!UseInst)
5403 continue;
5404 if (BBsToErase.count(UseInst->getParent()))
5405 continue;
5406 return true;
5407 }
5408 return false;
5409 };
5410
5411 while (BBsToErase.remove_if(HasRemainingUses)) {
5412 // Try again if anything was removed.
5413 }
5414
5415 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5416 DeleteDeadBlocks(BBVec);
5417}
5418
5419 CanonicalLoopInfo *
5420 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5421 InsertPointTy ComputeIP) {
5422 assert(Loops.size() >= 1 && "At least one loop required");
5423 size_t NumLoops = Loops.size();
5424
5425 // Nothing to do if there is already just one loop.
5426 if (NumLoops == 1)
5427 return Loops.front();
5428
5429 CanonicalLoopInfo *Outermost = Loops.front();
5430 CanonicalLoopInfo *Innermost = Loops.back();
5431 BasicBlock *OrigPreheader = Outermost->getPreheader();
5432 BasicBlock *OrigAfter = Outermost->getAfter();
5433 Function *F = OrigPreheader->getParent();
5434
5435 // Loop control blocks that may become orphaned later.
5436 SmallVector<BasicBlock *, 12> OldControlBBs;
5437 OldControlBBs.reserve(6 * Loops.size());
5438 for (CanonicalLoopInfo *Loop : Loops)
5439 Loop->collectControlBlocks(OldControlBBs);
5440
5441 // Setup the IRBuilder for inserting the trip count computation.
5442 Builder.SetCurrentDebugLocation(DL);
5443 if (ComputeIP.isSet())
5444 Builder.restoreIP(ComputeIP);
5445 else
5446 Builder.restoreIP(Outermost->getPreheaderIP());
5447
5448 // Derive the collapsed loop's trip count.
5449 // TODO: Find common/largest indvar type.
5450 Value *CollapsedTripCount = nullptr;
5451 for (CanonicalLoopInfo *L : Loops) {
5452 assert(L->isValid() &&
5453 "All loops to collapse must be valid canonical loops");
5454 Value *OrigTripCount = L->getTripCount();
5455 if (!CollapsedTripCount) {
5456 CollapsedTripCount = OrigTripCount;
5457 continue;
5458 }
5459
5460 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5461 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5462 }
5463
5464 // Create the collapsed loop control flow.
5465 CanonicalLoopInfo *Result =
5466 createLoopSkeleton(DL, CollapsedTripCount, F,
5467 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5468
5469 // Build the collapsed loop body code.
5470 // Start with deriving the input loop induction variables from the collapsed
5471 // one, using a divmod scheme. To preserve the original loops' order, the
5472 // innermost loop uses the least significant bits.
5473 Builder.restoreIP(Result->getBodyIP());
5474
5475 Value *Leftover = Result->getIndVar();
5476 SmallVector<Value *> NewIndVars;
5477 NewIndVars.resize(NumLoops);
5478 for (int i = NumLoops - 1; i >= 1; --i) {
5479 Value *OrigTripCount = Loops[i]->getTripCount();
5480
5481 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5482 NewIndVars[i] = NewIndVar;
5483
5484 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5485 }
5486 // Outermost loop gets all the remaining bits.
5487 NewIndVars[0] = Leftover;
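 // Worked example of the divmod scheme: collapsing two loops with trip counts
 // 4 (outer) and 5 (inner) gives a collapsed trip count of 20; for the
 // collapsed IV value 13, the inner IV is 13 % 5 = 3 and the outer IV is
 // 13 / 5 = 2, preserving the original iteration order.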
5488
5489 // Construct the loop body control flow.
5490 // We progressively construct the branch structure following in direction of
5491 // the control flow, from the leading in-between code, the loop nest body, the
5492 // trailing in-between code, and rejoining the collapsed loop's latch.
5493 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
5494 // the ContinueBlock is set, continue with that block. If ContinuePred, use
5495 // its predecessors as sources.
5496 BasicBlock *ContinueBlock = Result->getBody();
5497 BasicBlock *ContinuePred = nullptr;
5498 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5499 BasicBlock *NextSrc) {
5500 if (ContinueBlock)
5501 redirectTo(ContinueBlock, Dest, DL);
5502 else
5503 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5504
5505 ContinueBlock = nullptr;
5506 ContinuePred = NextSrc;
5507 };
5508
5509 // The code before the nested loop of each level.
5510 // Because we are sinking it into the nest, it will be executed more often
5511 // than the original loop. More sophisticated schemes could keep track of what
5512 // the in-between code is and instantiate it only once per thread.
5513 for (size_t i = 0; i < NumLoops - 1; ++i)
5514 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5515
5516 // Connect the loop nest body.
5517 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5518
5519 // The code after the nested loop at each level.
5520 for (size_t i = NumLoops - 1; i > 0; --i)
5521 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5522
5523 // Connect the finished loop to the collapsed loop latch.
5524 ContinueWith(Result->getLatch(), nullptr);
5525
5526 // Replace the input loops with the new collapsed loop.
5527 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5528 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5529
5530 // Replace the input loop indvars with the derived ones.
5531 for (size_t i = 0; i < NumLoops; ++i)
5532 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5533
5534 // Remove unused parts of the input loops.
5535 removeUnusedBlocksFromParent(OldControlBBs);
5536
5537 for (CanonicalLoopInfo *L : Loops)
5538 L->invalidate();
5539
5540#ifndef NDEBUG
5541 Result->assertOK();
5542#endif
5543 return Result;
5544}
5545
5546std::vector<CanonicalLoopInfo *>
5547 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5548 ArrayRef<Value *> TileSizes) {
5549 assert(TileSizes.size() == Loops.size() &&
5550 "Must pass as many tile sizes as there are loops");
5551 int NumLoops = Loops.size();
5552 assert(NumLoops >= 1 && "At least one loop to tile required");
5553
5554 CanonicalLoopInfo *OutermostLoop = Loops.front();
5555 CanonicalLoopInfo *InnermostLoop = Loops.back();
5556 Function *F = OutermostLoop->getBody()->getParent();
5557 BasicBlock *InnerEnter = InnermostLoop->getBody();
5558 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5559
5560 // Loop control blocks that may become orphaned later.
5561 SmallVector<BasicBlock *, 12> OldControlBBs;
5562 OldControlBBs.reserve(6 * Loops.size());
5563 for (CanonicalLoopInfo *Loop : Loops)
5564 Loop->collectControlBlocks(OldControlBBs);
5565
5566 // Collect original trip counts and induction variable to be accessible by
5567 // index. Also, the structure of the original loops is not preserved during
5568 // the construction of the tiled loops, so do it before we scavenge the BBs of
5569 // any original CanonicalLoopInfo.
5570 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5571 for (CanonicalLoopInfo *L : Loops) {
5572 assert(L->isValid() && "All input loops must be valid canonical loops");
5573 OrigTripCounts.push_back(L->getTripCount());
5574 OrigIndVars.push_back(L->getIndVar());
5575 }
5576
5577 // Collect the code between loop headers. These may contain SSA definitions
5578 // that are used in the loop nest body. To be usable within the innermost
5579 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5580 // these instructions may be executed more often than before the tiling.
5581 // TODO: It would be sufficient to only sink them into body of the
5582 // corresponding tile loop.
5583 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5584 for (int i = 0; i < NumLoops - 1; ++i) {
5585 CanonicalLoopInfo *Surrounding = Loops[i];
5586 CanonicalLoopInfo *Nested = Loops[i + 1];
5587
5588 BasicBlock *EnterBB = Surrounding->getBody();
5589 BasicBlock *ExitBB = Nested->getHeader();
5590 InbetweenCode.emplace_back(EnterBB, ExitBB);
5591 }
5592
5593 // Compute the trip counts of the floor loops.
5594 Builder.SetCurrentDebugLocation(DL);
5595 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5596 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5597 for (int i = 0; i < NumLoops; ++i) {
5598 Value *TileSize = TileSizes[i];
5599 Value *OrigTripCount = OrigTripCounts[i];
5600 Type *IVType = OrigTripCount->getType();
5601
5602 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5603 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5604
5605 // 0 if tripcount divides the tilesize, 1 otherwise.
5606 // 1 means we need an additional iteration for a partial tile.
5607 //
5608 // Unfortunately we cannot just use the roundup-formula
5609 // (tripcount + tilesize - 1)/tilesize
5610 // because the summation might overflow. We do not want to introduce undefined
5611 // behavior when the untiled loop nest did not.
5612 Value *FloorTripOverflow =
5613 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5614
5615 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5616 Value *FloorTripCount =
5617 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5618 "omp_floor" + Twine(i) + ".tripcount", true);
5619
5620 // Remember some values for later use.
5621 FloorCompleteCount.push_back(FloorCompleteTripCount);
5622 FloorCount.push_back(FloorTripCount);
5623 FloorRems.push_back(FloorTripRem);
5624 }
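 // For instance, with an original trip count of 10 and a tile size of 4:
 // FloorCompleteTripCount = 10 / 4 = 2, FloorTripRem = 10 % 4 = 2, the
 // overflow bit is 1 and the floor loop runs 3 times, the last time over a
 // partial tile of 2 iterations. The roundup form (10 + 4 - 1) / 4 would give
 // the same value but could wrap for trip counts near the type's maximum.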
5625
5626 // Generate the new loop nest, from the outermost to the innermost.
5627 std::vector<CanonicalLoopInfo *> Result;
5628 Result.reserve(NumLoops * 2);
5629
5630 // The basic block of the surrounding loop that enters the nest generated
5631 // loop.
5632 BasicBlock *Enter = OutermostLoop->getPreheader();
5633
5634 // The basic block of the surrounding loop where the inner code should
5635 // continue.
5636 BasicBlock *Continue = OutermostLoop->getAfter();
5637
5638 // Where the next loop basic block should be inserted.
5639 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5640
5641 auto EmbeddNewLoop =
5642 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5643 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5644 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5645 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5646 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5647 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5648
5649 // Setup the position where the next embedded loop connects to this loop.
5650 Enter = EmbeddedLoop->getBody();
5651 Continue = EmbeddedLoop->getLatch();
5652 OutroInsertBefore = EmbeddedLoop->getLatch();
5653 return EmbeddedLoop;
5654 };
5655
5656 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5657 const Twine &NameBase) {
5658 for (auto P : enumerate(TripCounts)) {
5659 CanonicalLoopInfo *EmbeddedLoop =
5660 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5661 Result.push_back(EmbeddedLoop);
5662 }
5663 };
5664
5665 EmbeddNewLoops(FloorCount, "floor");
5666
5667 // Within the innermost floor loop, emit the code that computes the tile
5668 // sizes.
5669 Builder.SetInsertPoint(Enter->getTerminator());
5670 SmallVector<Value *, 4> TileCounts;
5671 for (int i = 0; i < NumLoops; ++i) {
5672 CanonicalLoopInfo *FloorLoop = Result[i];
5673 Value *TileSize = TileSizes[i];
5674
5675 Value *FloorIsEpilogue =
5676 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5677 Value *TileTripCount =
5678 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5679
5680 TileCounts.push_back(TileTripCount);
5681 }
5682
5683 // Create the tile loops.
5684 EmbeddNewLoops(TileCounts, "tile");
5685
5686 // Insert the inbetween code into the body.
5687 BasicBlock *BodyEnter = Enter;
5688 BasicBlock *BodyEntered = nullptr;
5689 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5690 BasicBlock *EnterBB = P.first;
5691 BasicBlock *ExitBB = P.second;
5692
5693 if (BodyEnter)
5694 redirectTo(BodyEnter, EnterBB, DL);
5695 else
5696 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5697
5698 BodyEnter = nullptr;
5699 BodyEntered = ExitBB;
5700 }
5701
5702 // Append the original loop nest body into the generated loop nest body.
5703 if (BodyEnter)
5704 redirectTo(BodyEnter, InnerEnter, DL);
5705 else
5706 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5707 redirectAllPredecessorsTo(InnerLatch, Result.back()->getLatch(), DL);
5708
5709 // Replace the original induction variable with an induction variable computed
5710 // from the tile and floor induction variables.
5711 Builder.restoreIP(Result.back()->getBodyIP());
5712 for (int i = 0; i < NumLoops; ++i) {
5713 CanonicalLoopInfo *FloorLoop = Result[i];
5714 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5715 Value *OrigIndVar = OrigIndVars[i];
5716 Value *Size = TileSizes[i];
5717
5718 Value *Scale =
5719 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5720 Value *Shift =
5721 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5722 OrigIndVar->replaceAllUsesWith(Shift);
5723 }
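 // That is, the original IV is recovered as floor_iv * tilesize + tile_iv; in
 // the example above, floor_iv = 2 and tile_iv = 1 map back to iteration
 // 2 * 4 + 1 = 9 of the untiled loop.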
5724
5725 // Remove unused parts of the original loops.
5726 removeUnusedBlocksFromParent(OldControlBBs);
5727
5728 for (CanonicalLoopInfo *L : Loops)
5729 L->invalidate();
5730
5731#ifndef NDEBUG
5732 for (CanonicalLoopInfo *GenL : Result)
5733 GenL->assertOK();
5734#endif
5735 return Result;
5736}
5737
5738/// Attach metadata \p Properties to the basic block described by \p BB. If the
5739/// basic block already has metadata, the basic block properties are appended.
5740 static void addBasicBlockMetadata(BasicBlock *BB,
5741 ArrayRef<Metadata *> Properties) {
5742 // Nothing to do if no property to attach.
5743 if (Properties.empty())
5744 return;
5745
5746 LLVMContext &Ctx = BB->getContext();
5747 SmallVector<Metadata *> NewProperties;
5748 NewProperties.push_back(nullptr);
5749
5750 // If the basic block already has metadata, prepend it to the new metadata.
5751 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5752 if (Existing)
5753 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5754
5755 append_range(NewProperties, Properties);
5756 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5757 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5758
5759 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5760}
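// The first operand refers to the node itself so the loop ID is distinct per
// loop; the resulting IR looks roughly like:
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}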
5761
5762/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5763/// loop already has metadata, the loop properties are appended.
5764 static void addLoopMetadata(CanonicalLoopInfo *Loop,
5765 ArrayRef<Metadata *> Properties) {
5766 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5767
5768 // Attach metadata to the loop's latch
5769 BasicBlock *Latch = Loop->getLatch();
5770 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5771 addBasicBlockMetadata(Latch, Properties);
5772}
5773
5774/// Attach llvm.access.group metadata to the memref instructions of \p Block
5775static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5776 LoopInfo &LI) {
5777 for (Instruction &I : *Block) {
5778 if (I.mayReadOrWriteMemory()) {
5779 // TODO: This instruction may already have access group from
5780 // other pragmas e.g. #pragma clang loop vectorize. Append
5781 // so that the existing metadata is not overwritten.
5782 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5783 }
5784 }
5785}
5786
5787 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5788 LLVMContext &Ctx = Builder.getContext();
5789 addLoopMetadata(
5790 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5791 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5792}
5793
5794 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5795 LLVMContext &Ctx = Builder.getContext();
5796 addLoopMetadata(
5797 Loop, {
5798 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5799 });
5800}
5801
5802void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5803 Value *IfCond, ValueToValueMapTy &VMap,
5804 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5805 const Twine &NamePrefix) {
5806 Function *F = CanonicalLoop->getFunction();
5807
5808 // We can't do
5809 // if (cond) {
5810 // simd_loop;
5811 // } else {
5812 // non_simd_loop;
5813 // }
5814 // because then the CanonicalLoopInfo would only point to one of the loops:
5815 // causing other constructs that operate on the same loop to malfunction.
5816 // Instead generate
5817 // while (...) {
5818 // if (cond) {
5819 // simd_body;
5820 // } else {
5821 // not_simd_body;
5822 // }
5823 // }
5824 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5825 // body at -O3
5826
5827 // Define where if branch should be inserted
5828 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5829
5830 // Create additional blocks for the if statement
5831 BasicBlock *Cond = SplitBeforeIt->getParent();
5832 llvm::LLVMContext &C = Cond->getContext();
5833 BasicBlock *ThenBlock = BasicBlock::Create(
5834 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5835 BasicBlock *ElseBlock = BasicBlock::Create(
5836 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5837
5838 // Create if condition branch.
5839 Builder.SetInsertPoint(SplitBeforeIt);
5840 Instruction *BrInstr =
5841 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5842 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5843 // Then block contains branch to omp loop body which needs to be vectorized
5844 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5845 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5846
5847 Builder.SetInsertPoint(ElseBlock);
5848
5849 // Clone loop for the else branch
5850 SmallVector<BasicBlock *, 8> NewBlocks;
5851
5852 SmallVector<BasicBlock *, 8> ExistingBlocks;
5853 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5854 ExistingBlocks.push_back(ThenBlock);
5855 ExistingBlocks.append(L->block_begin(), L->block_end());
5856 // Cond is the block that has the if clause condition
5857 // LoopCond is omp_loop.cond
5858 // LoopHeader is omp_loop.header
5859 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5860 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
5861 assert(LoopCond && LoopHeader && "Invalid loop structure");
5862 for (BasicBlock *Block : ExistingBlocks) {
5863 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5864 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5865 continue;
5866 }
5867 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5868
5869 // fix name not to be omp.if.then
5870 if (Block == ThenBlock)
5871 NewBB->setName(NamePrefix + ".if.else");
5872
5873 NewBB->moveBefore(CanonicalLoop->getExit());
5874 VMap[Block] = NewBB;
5875 NewBlocks.push_back(NewBB);
5876 }
5877 remapInstructionsInBlocks(NewBlocks, VMap);
5878 Builder.CreateBr(NewBlocks.front());
5879
5880 // The loop latch must have only one predecessor. Currently it is branched to
5881 // from both the 'then' and 'else' branches.
5882 L->getLoopLatch()->splitBasicBlock(
5883 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5884
5885 // Ensure that the then block is added to the loop so we add the attributes in
5886 // the next step
5887 L->addBasicBlockToLoop(ThenBlock, LI);
5888}
5889
5890unsigned
5892 const StringMap<bool> &Features) {
5893 if (TargetTriple.isX86()) {
5894 if (Features.lookup("avx512f"))
5895 return 512;
5896 else if (Features.lookup("avx"))
5897 return 256;
5898 return 128;
5899 }
5900 if (TargetTriple.isPPC())
5901 return 128;
5902 if (TargetTriple.isWasm())
5903 return 128;
5904 return 0;
5905}
5906
5907 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5908 MapVector<Value *, Value *> AlignedVars,
5909 Value *IfCond, OrderKind Order,
5910 ConstantInt *Simdlen, ConstantInt *Safelen) {
5911 LLVMContext &Ctx = Builder.getContext();
5912
5913 Function *F = CanonicalLoop->getFunction();
5914
5915 // TODO: We should not rely on pass manager. Currently we use pass manager
5916 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5917 // object. We should have a method which returns all blocks between
5918 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5919 FunctionAnalysisManager FAM;
5920 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5921 FAM.registerPass([]() { return LoopAnalysis(); });
5922 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5923
5924 LoopAnalysis LIA;
5925 LoopInfo &&LI = LIA.run(*F, FAM);
5926
5927 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5928 if (AlignedVars.size()) {
5929 InsertPointTy IP = Builder.saveIP();
5930 for (auto &AlignedItem : AlignedVars) {
5931 Value *AlignedPtr = AlignedItem.first;
5932 Value *Alignment = AlignedItem.second;
5933 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5934 Builder.SetInsertPoint(loadInst->getNextNode());
5935 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5936 Alignment);
5937 }
5938 Builder.restoreIP(IP);
5939 }
5940
5941 if (IfCond) {
5942 ValueToValueMapTy VMap;
5943 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5944 }
5945
5947
5948 // Get the basic blocks from the loop in which memref instructions
5949 // can be found.
5950 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5951 // preferably without running any passes.
5952 for (BasicBlock *Block : L->getBlocks()) {
5953 if (Block == CanonicalLoop->getCond() ||
5954 Block == CanonicalLoop->getHeader())
5955 continue;
5956 Reachable.insert(Block);
5957 }
5958
5959 SmallVector<Metadata *> LoopMDList;
5960
5961 // In presence of finite 'safelen', it may be unsafe to mark all
5962 // the memory instructions parallel, because loop-carried
5963 // dependences of 'safelen' iterations are possible.
5964 // If clause order(concurrent) is specified then the memory instructions
5965 // are marked parallel even if 'safelen' is finite.
5966 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5967 // Add access group metadata to memory-access instructions.
5968 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5969 for (BasicBlock *BB : Reachable)
5970 addSimdMetadata(BB, AccessGroup, LI);
5971 // TODO: If the loop has existing parallel access metadata, have
5972 // to combine two lists.
5973 LoopMDList.push_back(MDNode::get(
5974 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5975 }
5976
5977 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5978 // versions so we can't add the loop attributes in that case.
5979 if (IfCond) {
5980 // we can still add llvm.loop.parallel_access
5981 addLoopMetadata(CanonicalLoop, LoopMDList);
5982 return;
5983 }
5984
5985 // Use the above access group metadata to create loop level
5986 // metadata, which should be distinct for each loop.
5987 ConstantAsMetadata *BoolConst =
5988 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5989 LoopMDList.push_back(MDNode::get(
5990 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5991
5992 if (Simdlen || Safelen) {
5993 // If both simdlen and safelen clauses are specified, the value of the
5994 // simdlen parameter must be less than or equal to the value of the safelen
5995 // parameter. Therefore, use safelen only in the absence of simdlen.
5996 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5997 LoopMDList.push_back(
5998 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5999 ConstantAsMetadata::get(VectorizeWidth)}));
6000 }
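 // E.g. for `#pragma omp simd simdlen(8)` (no safelen), the latch terminator
 // ends up with loop metadata along the lines of:
 //   !{!"llvm.loop.parallel_accesses", !ag}
 //   !{!"llvm.loop.vectorize.enable", i1 true}
 //   !{!"llvm.loop.vectorize.width", i32 8}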
6001
6002 addLoopMetadata(CanonicalLoop, LoopMDList);
6003}
6004
6005/// Create the TargetMachine object to query the backend for optimization
6006/// preferences.
6007///
6008/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6009/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6010 /// needed for the LLVM pass pipeline. We use some default options to avoid
6011/// having to pass too many settings from the frontend that probably do not
6012/// matter.
6013///
6014/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6015/// method. If we are going to use TargetMachine for more purposes, especially
6016/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6017 /// might become worth requiring front-ends to pass on their TargetMachine,
6018 /// or at least cache it between methods. Note that while front-ends such as Clang
6019 /// have just a single main TargetMachine per translation unit, "target-cpu" and
6020 /// "target-features" that determine the TargetMachine are per-function and can
6021 /// be overridden using __attribute__((target("OPTIONS"))).
6022static std::unique_ptr<TargetMachine>
6023 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6024 Module *M = F->getParent();
6025
6026 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6027 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6028 const llvm::Triple &Triple = M->getTargetTriple();
6029
6030 std::string Error;
6031 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6032 if (!TheTarget)
6033 return {};
6034
6035 llvm::TargetOptions Options;
6036 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6037 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6038 /*CodeModel=*/std::nullopt, OptLevel));
6039}
6040
6041/// Heuristically determine the best-performant unroll factor for \p CLI. This
6042/// depends on the target processor. We are re-using the same heuristics as the
6043/// LoopUnrollPass.
6044 static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6045 Function *F = CLI->getFunction();
6046
6047 // Assume the user requests the most aggressive unrolling, even if the rest of
6048 // the code is optimized using a lower setting.
6049 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6050 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6051
6052 FunctionAnalysisManager FAM;
6053 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6054 FAM.registerPass([]() { return AssumptionAnalysis(); });
6055 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6056 FAM.registerPass([]() { return LoopAnalysis(); });
6057 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6058 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6059 TargetIRAnalysis TIRA;
6060 if (TM)
6061 TIRA = TargetIRAnalysis(
6062 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6063 FAM.registerPass([&]() { return TIRA; });
6064
6065 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6066 ScalarEvolutionAnalysis SEA;
6067 ScalarEvolution &&SE = SEA.run(*F, FAM);
6068 DominatorTreeAnalysis DTA;
6069 DominatorTree &&DT = DTA.run(*F, FAM);
6070 LoopAnalysis LIA;
6071 LoopInfo &&LI = LIA.run(*F, FAM);
6073 AssumptionCache &&AC = ACT.run(*F, FAM);
6075
6076 Loop *L = LI.getLoopFor(CLI->getHeader());
6077 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6078
6079 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6080 L, SE, TTI,
6081 /*BlockFrequencyInfo=*/nullptr,
6082 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6083 /*UserThreshold=*/std::nullopt,
6084 /*UserCount=*/std::nullopt,
6085 /*UserAllowPartial=*/true,
6086 /*UserAllowRuntime=*/true,
6087 /*UserUpperBound=*/std::nullopt,
6088 /*UserFullUnrollMaxCount=*/std::nullopt);
6089
6090 UP.Force = true;
6091
6092 // Account for additional optimizations taking place before the LoopUnrollPass
6093 // would unroll the loop.
6094 UP.Threshold *= UnrollThresholdFactor;
6095 UP.PartialThreshold *= UnrollThresholdFactor;
6096
6097 // Use normal unroll factors even if the rest of the code is optimized for
6098 // size.
6099 UP.OptSizeThreshold = UP.Threshold;
6100 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6101
6102 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6103 << " Threshold=" << UP.Threshold << "\n"
6104 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6105 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6106 << " PartialOptSizeThreshold="
6107 << UP.PartialOptSizeThreshold << "\n");
6108
6109 // Disable peeling.
6110 TargetTransformInfo::PeelingPreferences PP =
6111 gatherPeelingPreferences(L, SE, TTI,
6112 /*UserAllowPeeling=*/false,
6113 /*UserAllowProfileBasedPeeling=*/false,
6114 /*UnrollingSpecificValues=*/false);
6115
6116 SmallPtrSet<const Value *, 32> EphValues;
6117 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6118
6119 // Assume that reads and writes to stack variables can be eliminated by
6120 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6121 // size.
6122 for (BasicBlock *BB : L->blocks()) {
6123 for (Instruction &I : *BB) {
6124 Value *Ptr;
6125 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6126 Ptr = Load->getPointerOperand();
6127 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6128 Ptr = Store->getPointerOperand();
6129 } else
6130 continue;
6131
6132 Ptr = Ptr->stripPointerCasts();
6133
6134 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6135 if (Alloca->getParent() == &F->getEntryBlock())
6136 EphValues.insert(&I);
6137 }
6138 }
6139 }
6140
6141 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6142
6143 // Loop is not unrollable if the loop contains certain instructions.
6144 if (!UCE.canUnroll()) {
6145 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6146 return 1;
6147 }
6148
6149 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6150 << "\n");
6151
6152 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6153 // be able to use it.
6154 int TripCount = 0;
6155 int MaxTripCount = 0;
6156 bool MaxOrZero = false;
6157 unsigned TripMultiple = 0;
6158
6159 bool UseUpperBound = false;
6160 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6161 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6162 UseUpperBound);
6163 unsigned Factor = UP.Count;
6164 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6165
6166 // A factor of 1 signals that the loop should not be unrolled.
6167 if (Factor == 0)
6168 return 1;
6169 return Factor;
6170}
6171
6172 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6173 int32_t Factor,
6174 CanonicalLoopInfo **UnrolledCLI) {
6175 assert(Factor >= 0 && "Unroll factor must not be negative");
6176
6177 Function *F = Loop->getFunction();
6178 LLVMContext &Ctx = F->getContext();
6179
6180 // If the unrolled loop is not used for another loop-associated directive, it
6181 // is sufficient to add metadata for the LoopUnrollPass.
6182 if (!UnrolledCLI) {
6183 SmallVector<Metadata *, 2> LoopMetadata;
6184 LoopMetadata.push_back(
6185 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6186
6187 if (Factor >= 1) {
6188 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6189 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6190 LoopMetadata.push_back(MDNode::get(
6191 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6192 }
6193
6194 addLoopMetadata(Loop, LoopMetadata);
6195 return;
6196 }
6197
6198 // Heuristically determine the unroll factor.
6199 if (Factor == 0)
6200 Factor = computeHeuristicUnrollFactor(Loop);
6201
6202 // No change required with unroll factor 1.
6203 if (Factor == 1) {
6204 *UnrolledCLI = Loop;
6205 return;
6206 }
6207
6208 assert(Factor >= 2 &&
6209 "unrolling only makes sense with a factor of 2 or larger");
6210
6211 Type *IndVarTy = Loop->getIndVarType();
6212
6213 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6214 // unroll the inner loop.
6215 Value *FactorVal =
6216 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6217 /*isSigned=*/false));
6218 std::vector<CanonicalLoopInfo *> LoopNest =
6219 tileLoops(DL, {Loop}, {FactorVal});
6220 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6221 *UnrolledCLI = LoopNest[0];
6222 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6223
6224 // LoopUnrollPass can only fully unroll loops with constant trip count.
6225 // Unroll by the unroll factor with a fallback epilog for the remainder
6226 // iterations if necessary.
6227 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6228 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6229 addLoopMetadata(
6230 InnerLoop,
6231 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6232 MDNode::get(
6233 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6234
6235#ifndef NDEBUG
6236 (*UnrolledCLI)->assertOK();
6237#endif
6238}
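// For illustration: unrollLoopPartial(DL, Loop, /*Factor=*/4, nullptr) only
// annotates the loop for the LoopUnrollPass, roughly:
//   !{!"llvm.loop.unroll.enable"}
//   !{!"llvm.loop.unroll.count", i32 4}
// With a non-null UnrolledCLI, the loop is instead tiled by 4 and the inner
// loop is marked with the same metadata for full unrolling, leaving the outer
// loop available for further loop-associated directives.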
6239
6240 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCopyPrivate(
6241 const LocationDescription &Loc,
6242 llvm::Value *BufSize, llvm::Value *CpyBuf,
6243 llvm::Value *CpyFn, llvm::Value *DidIt) {
6244 if (!updateToLocation(Loc))
6245 return Loc.IP;
6246
6247 uint32_t SrcLocStrSize;
6248 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6249 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6250 Value *ThreadId = getOrCreateThreadID(Ident);
6251
6252 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6253
6254 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6255
6256 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6257 Builder.CreateCall(Fn, Args);
6258
6259 return Builder.saveIP();
6260}
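// The emitted sequence is roughly (value names illustrative):
//   %did_it = load i32, ptr %DidIt
//   call void @__kmpc_copyprivate(ptr @ident, i32 %tid, i64 %BufSize,
//                                 ptr %CpyBuf, ptr %CpyFn, i32 %did_it)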
6261
6262 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6263 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6264 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6265 ArrayRef<llvm::Function *> CPFuncs) {
6266
6267 if (!updateToLocation(Loc))
6268 return Loc.IP;
6269
6270 // If needed, allocate and initialize `DidIt` with 0.
6271 // DidIt: flag variable: 1=single thread; 0=not single thread.
6272 llvm::Value *DidIt = nullptr;
6273 if (!CPVars.empty()) {
6274 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6275 Builder.CreateStore(Builder.getInt32(0), DidIt);
6276 }
6277
6278 Directive OMPD = Directive::OMPD_single;
6279 uint32_t SrcLocStrSize;
6280 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6281 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6282 Value *ThreadId = getOrCreateThreadID(Ident);
6283 Value *Args[] = {Ident, ThreadId};
6284
6285 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6286 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6287
6288 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6289 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6290
6291 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6292 if (Error Err = FiniCB(IP))
6293 return Err;
6294
6295 // The thread that executes the single region must set `DidIt` to 1.
6296 // This is used by __kmpc_copyprivate, to know if the caller is the
6297 // single thread or not.
6298 if (DidIt)
6299 Builder.CreateStore(Builder.getInt32(1), DidIt);
6300
6301 return Error::success();
6302 };
6303
6304 // generates the following:
6305 // if (__kmpc_single()) {
6306 // .... single region ...
6307 // __kmpc_end_single
6308 // }
6309 // __kmpc_copyprivate
6310 // __kmpc_barrier
6311
6312 InsertPointOrErrorTy AfterIP =
6313 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6314 /*Conditional*/ true,
6315 /*hasFinalize*/ true);
6316 if (!AfterIP)
6317 return AfterIP.takeError();
6318
6319 if (DidIt) {
6320 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6321 // NOTE BufSize is currently unused, so just pass 0.
6322 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6323 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6324 CPFuncs[I], DidIt);
6325 // NOTE __kmpc_copyprivate already inserts a barrier
6326 } else if (!IsNowait) {
6327 InsertPointOrErrorTy AfterIP =
6328 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6329 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6330 /* CheckCancelFlag */ false);
6331 if (!AfterIP)
6332 return AfterIP.takeError();
6333 }
6334 return Builder.saveIP();
6335}
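// In C-like pseudo code, a single region with one copyprivate variable lowers
// roughly to the following (sketch):
//   int did_it = 0;
//   if (__kmpc_single(&loc, tid)) {
//     /* single region */
//     did_it = 1;
//     __kmpc_end_single(&loc, tid);
//   }
//   __kmpc_copyprivate(&loc, tid, 0, &cpvar, cpfn, did_it);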
6336
6337 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6338 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6339 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6340
6341 if (!updateToLocation(Loc))
6342 return Loc.IP;
6343
6344 Directive OMPD = Directive::OMPD_critical;
6345 uint32_t SrcLocStrSize;
6346 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6347 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6348 Value *ThreadId = getOrCreateThreadID(Ident);
6349 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6350 Value *Args[] = {Ident, ThreadId, LockVar};
6351
6352 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6353 Function *RTFn = nullptr;
6354 if (HintInst) {
6355 // Add Hint to entry Args and create call
6356 EnterArgs.push_back(HintInst);
6357 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6358 } else {
6359 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6360 }
6361 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6362
6363 Function *ExitRTLFn =
6364 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6365 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6366
6367 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6368 /*Conditional*/ false, /*hasFinalize*/ true);
6369}
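// The emitted region is roughly (lock variable name illustrative):
//   call void @__kmpc_critical(ptr @ident, i32 %tid, ptr @.critical_lock)
//   ... region body ...
//   call void @__kmpc_end_critical(ptr @ident, i32 %tid, ptr @.critical_lock)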
6370
6371 OpenMPIRBuilder::InsertPointTy
6372 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6373 InsertPointTy AllocaIP, unsigned NumLoops,
6374 ArrayRef<llvm::Value *> StoreValues,
6375 const Twine &Name, bool IsDependSource) {
6376 assert(
6377 llvm::all_of(StoreValues,
6378 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6379 "OpenMP runtime requires depend vec with i64 type");
6380
6381 if (!updateToLocation(Loc))
6382 return Loc.IP;
6383
6384 // Allocate space for vector and generate alloc instruction.
6385 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6386 Builder.restoreIP(AllocaIP);
6387 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6388 ArgsBase->setAlignment(Align(8));
6389 updateToLocation(Loc);
6390
6391 // Store the index value with offset in depend vector.
6392 for (unsigned I = 0; I < NumLoops; ++I) {
6393 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6394 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6395 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6396 STInst->setAlignment(Align(8));
6397 }
6398
6399 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6400 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6401
6402 uint32_t SrcLocStrSize;
6403 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6404 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6405 Value *ThreadId = getOrCreateThreadID(Ident);
6406 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6407
6408 Function *RTLFn = nullptr;
6409 if (IsDependSource)
6410 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6411 else
6412 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6413 Builder.CreateCall(RTLFn, Args);
6414
6415 return Builder.saveIP();
6416}
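// For NumLoops == 2 with IsDependSource set, the emitted sequence is roughly
// (value names illustrative):
//   %vec = alloca [2 x i64], align 8
//   store i64 %iv0, ptr %vec          ; index 0
//   store i64 %iv1, ptr %vec.idx1     ; index 1
//   call void @__kmpc_doacross_post(ptr @ident, i32 %tid, ptr %vec)
// With IsDependSource unset, @__kmpc_doacross_wait is called instead.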
6417
6418 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6419 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6420 FinalizeCallbackTy FiniCB, bool IsThreads) {
6421 if (!updateToLocation(Loc))
6422 return Loc.IP;
6423
6424 Directive OMPD = Directive::OMPD_ordered;
6425 Instruction *EntryCall = nullptr;
6426 Instruction *ExitCall = nullptr;
6427
6428 if (IsThreads) {
6429 uint32_t SrcLocStrSize;
6430 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6431 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6432 Value *ThreadId = getOrCreateThreadID(Ident);
6433 Value *Args[] = {Ident, ThreadId};
6434
6435 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6436 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6437
6438 Function *ExitRTLFn =
6439 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6440 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6441 }
6442
6443 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6444 /*Conditional*/ false, /*hasFinalize*/ true);
6445}
6446
6447 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6448 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6449 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6450 bool HasFinalize, bool IsCancellable) {
6451
6452 if (HasFinalize)
6453 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6454
6455 // Create inlined region's entry and body blocks, in preparation
6456 // for conditional creation
6457 BasicBlock *EntryBB = Builder.GetInsertBlock();
6458 Instruction *SplitPos = EntryBB->getTerminator();
6459 if (!isa_and_nonnull<BranchInst>(SplitPos))
6460 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6461 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6462 BasicBlock *FiniBB =
6463 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6464
6465 Builder.SetInsertPoint(EntryBB->getTerminator());
6466 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6467
6468 // generate body
6469 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6470 /* CodeGenIP */ Builder.saveIP()))
6471 return Err;
6472
6473 // emit exit call and do any needed finalization.
6474 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6475 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6476 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6477 "Unexpected control flow graph state!!");
6478 InsertPointOrErrorTy AfterIP =
6479 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6480 if (!AfterIP)
6481 return AfterIP.takeError();
6482 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6483 "Unexpected Control Flow State!");
6484 MergeBlockIntoPredecessor(FiniBB);
6485
6486 // If we are skipping the region of a non-conditional directive, remove the
6487 // exit block, and clear the builder's insertion point.
6488 assert(SplitPos->getParent() == ExitBB &&
6489 "Unexpected Insertion point location!");
6490 auto merged = MergeBlockIntoPredecessor(ExitBB);
6491 BasicBlock *ExitPredBB = SplitPos->getParent();
6492 auto InsertBB = merged ? ExitPredBB : ExitBB;
6493 if (!isa_and_nonnull<BranchInst>(SplitPos))
6494 SplitPos->eraseFromParent();
6495 Builder.SetInsertPoint(InsertBB);
6496
6497 return Builder.saveIP();
6498}
6499
6500 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6501 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6502 // If there is nothing to do, return the current insertion point.
6503 if (!Conditional || !EntryCall)
6504 return Builder.saveIP();
6505
6506 BasicBlock *EntryBB = Builder.GetInsertBlock();
6507 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6508 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6509 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6510
6511 // Emit thenBB and set the Builder's insertion point there for
6512 // body generation next. Place the block after the current block.
6513 Function *CurFn = EntryBB->getParent();
6514 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6515
6516 // Move Entry branch to end of ThenBB, and replace with conditional
6517 // branch (If-stmt)
6518 Instruction *EntryBBTI = EntryBB->getTerminator();
6519 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6520 EntryBBTI->removeFromParent();
6521 Builder.SetInsertPoint(UI);
6522 Builder.Insert(EntryBBTI);
6523 UI->eraseFromParent();
6524 Builder.SetInsertPoint(ThenBB->getTerminator());
6525
6526 // return an insertion point to ExitBB.
6527 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6528}
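// For a conditional entry, the resulting CFG is roughly:
//   entry:
//     %not_null = icmp ne i32 %entry_call_result, 0
//     br i1 %not_null, label %omp_region.body, label %omp_region.end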
6529
6530 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6531 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6532 bool HasFinalize) {
6533
6534 Builder.restoreIP(FinIP);
6535
6536 // If there is finalization to do, emit it before the exit call
6537 if (HasFinalize) {
6538 assert(!FinalizationStack.empty() &&
6539 "Unexpected finalization stack state!");
6540
6541 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6542 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6543
6544 if (Error Err = Fi.FiniCB(FinIP))
6545 return Err;
6546
6547 BasicBlock *FiniBB = FinIP.getBlock();
6548 Instruction *FiniBBTI = FiniBB->getTerminator();
6549
6550 // set Builder IP for call creation
6551 Builder.SetInsertPoint(FiniBBTI);
6552 }
6553
6554 if (!ExitCall)
6555 return Builder.saveIP();
6556
6557 // Place the exit call as the last instruction before the finalization block
6557 // terminator.
6558 ExitCall->removeFromParent();
6559 Builder.Insert(ExitCall);
6560
6561 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6562 ExitCall->getIterator());
6563}
6564
6565 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6566 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6567 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6568 if (!IP.isSet())
6569 return IP;
6570
6571 IRBuilder<>::InsertPointGuard IPG(Builder);
6572
6573 // creates the following CFG structure
6574 // OMP_Entry : (MasterAddr != PrivateAddr)?
6575 // F T
6576 // | \
6577 // | copyin.not.master
6578 // | /
6579 // v /
6580 // copyin.not.master.end
6581 // |
6582 // v
6583 // OMP.Entry.Next
6584
6585 BasicBlock *OMP_Entry = IP.getBlock();
6586 Function *CurFn = OMP_Entry->getParent();
6587 BasicBlock *CopyBegin =
6588 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6589 BasicBlock *CopyEnd = nullptr;
6590
6591 // If the entry block is terminated, split it to preserve the branch to the
6592 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6593 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6594 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6595 "copyin.not.master.end");
6596 OMP_Entry->getTerminator()->eraseFromParent();
6597 } else {
6598 CopyEnd =
6599 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6600 }
6601
6602 Builder.SetInsertPoint(OMP_Entry);
6603 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6604 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6605 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6606 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6607
6608 Builder.SetInsertPoint(CopyBegin);
6609 if (BranchtoEnd)
6610 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6611
6612 return Builder.saveIP();
6613}
6614
6615 Value *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6616 Value *Size, Value *Allocator,
6617 std::string Name) {
6618 IRBuilder<>::InsertPointGuard IPG(Builder);
6619 updateToLocation(Loc);
6620
6621 uint32_t SrcLocStrSize;
6622 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6623 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6624 Value *ThreadId = getOrCreateThreadID(Ident);
6625 Value *Args[] = {ThreadId, Size, Allocator};
6626
6627 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6628
6629 return Builder.CreateCall(Fn, Args, Name);
6630}
6631
6632 Value *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6633 Value *Addr, Value *Allocator,
6634 std::string Name) {
6635 IRBuilder<>::InsertPointGuard IPG(Builder);
6636 updateToLocation(Loc);
6637
6638 uint32_t SrcLocStrSize;
6639 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6640 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6641 Value *ThreadId = getOrCreateThreadID(Ident);
6642 Value *Args[] = {ThreadId, Addr, Allocator};
6643 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6644 return Builder.CreateCall(Fn, Args, Name);
6645}
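// Together, createOMPAlloc and createOMPFree emit roughly (sketch):
//   %ptr = call ptr @__kmpc_alloc(i32 %tid, i64 %size, ptr %allocator)
//   ...
//   call void @__kmpc_free(i32 %tid, ptr %ptr, ptr %allocator)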
6646
6647 CallInst *OpenMPIRBuilder::createOMPInteropInit(
6648 const LocationDescription &Loc, Value *InteropVar,
6649 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6650 Value *DependenceAddress, bool HaveNowaitClause) {
6651 IRBuilder<>::InsertPointGuard IPG(Builder);
6652 updateToLocation(Loc);
6653
6654 uint32_t SrcLocStrSize;
6655 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6656 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6657 Value *ThreadId = getOrCreateThreadID(Ident);
6658 if (Device == nullptr)
6659 Device = ConstantInt::get(Int32, -1);
6660 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6661 if (NumDependences == nullptr) {
6662 NumDependences = ConstantInt::get(Int32, 0);
6663 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6664 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6665 }
6666 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6667 Value *Args[] = {
6668 Ident, ThreadId, InteropVar, InteropTypeVal,
6669 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6670
6671 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6672
6673 return Builder.CreateCall(Fn, Args);
6674}
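// With Device and NumDependences left null, the defaulted call is roughly:
//   call void @__tgt_interop_init(ptr @ident, i32 %tid, ptr %interop,
//       i32 <interop-type>, i32 -1, i32 0, ptr null, i32 0)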
6675
6676 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6677 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6678 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6679 IRBuilder<>::InsertPointGuard IPG(Builder);
6680 updateToLocation(Loc);
6681
6682 uint32_t SrcLocStrSize;
6683 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6684 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6685 Value *ThreadId = getOrCreateThreadID(Ident);
6686 if (Device == nullptr)
6687 Device = ConstantInt::get(Int32, -1);
6688 if (NumDependences == nullptr) {
6689 NumDependences = ConstantInt::get(Int32, 0);
6690 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6691 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6692 }
6693 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6694 Value *Args[] = {
6695 Ident, ThreadId, InteropVar, Device,
6696 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6697
6698 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6699
6700 return Builder.CreateCall(Fn, Args);
6701}
6702
6703 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6704 Value *InteropVar, Value *Device,
6705 Value *NumDependences,
6706 Value *DependenceAddress,
6707 bool HaveNowaitClause) {
6708 IRBuilder<>::InsertPointGuard IPG(Builder);
6709 updateToLocation(Loc);
6710 uint32_t SrcLocStrSize;
6711 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6712 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6713 Value *ThreadId = getOrCreateThreadID(Ident);
6714 if (Device == nullptr)
6715 Device = ConstantInt::get(Int32, -1);
6716 if (NumDependences == nullptr) {
6717 NumDependences = ConstantInt::get(Int32, 0);
6718 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6719 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6720 }
6721 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6722 Value *Args[] = {
6723 Ident, ThreadId, InteropVar, Device,
6724 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6725
6726 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6727
6728 return Builder.CreateCall(Fn, Args);
6729}
6730
6731 CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6732 const LocationDescription &Loc, llvm::Value *Pointer,
6733 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6734 IRBuilder<>::InsertPointGuard IPG(Builder);
6735 updateToLocation(Loc);
6736
6737 uint32_t SrcLocStrSize;
6738 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6739 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6740 Value *ThreadId = getOrCreateThreadID(Ident);
6741 Constant *ThreadPrivateCache =
6742 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6743 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6744
6745 Function *Fn =
6746 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6747
6748 return Builder.CreateCall(Fn, Args);
6749}
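// The emitted call is roughly (cache variable name illustrative):
//   %copy = call ptr @__kmpc_threadprivate_cached(ptr @ident, i32 %tid,
//       ptr %var, i64 %size, ptr @var.cache)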
6750
6751 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6752 const LocationDescription &Loc,
6753 const TargetKernelDefaultAttrs &Attrs) {
6754 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6755 "expected num_threads and num_teams to be specified");
6756
6757 if (!updateToLocation(Loc))
6758 return Loc.IP;
6759
6760 uint32_t SrcLocStrSize;
6761 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6762 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6763 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6764 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6765 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6766 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6767 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6768
6769 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6770 Function *Kernel = DebugKernelWrapper;
6771
6772 // We need to strip the debug prefix to get the correct kernel name.
6773 StringRef KernelName = Kernel->getName();
6774 const std::string DebugPrefix = "_debug__";
6775 if (KernelName.ends_with(DebugPrefix)) {
6776 KernelName = KernelName.drop_back(DebugPrefix.length());
6777 Kernel = M.getFunction(KernelName);
6778 assert(Kernel && "Expected the real kernel to exist");
6779 }
6780
6781 // Manifest the launch configuration in the metadata matching the kernel
6782 // environment.
6783 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6784 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6785
6786 // If MaxThreads is not set, select the maximum of the default workgroup
6787 // size and the MinThreads value.
6788 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6789 if (MaxThreadsVal < 0)
6790 MaxThreadsVal = std::max(
6791 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6792
6793 if (MaxThreadsVal > 0)
6794 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6795
6796 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6797 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6798 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6799 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6800 Constant *ReductionDataSize =
6801 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6802 Constant *ReductionBufferLength =
6803 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6804
6805 Function *Fn = getOrCreateRuntimeFunctionPtr(
6806 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6807 const DataLayout &DL = Fn->getDataLayout();
6808
6809 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6810 Constant *DynamicEnvironmentInitializer =
6811 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6812 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6813 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6814 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6815 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6816 DL.getDefaultGlobalsAddressSpace());
6817 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6818
6819 Constant *DynamicEnvironment =
6820 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6821 ? DynamicEnvironmentGV
6822 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6823 DynamicEnvironmentPtr);
6824
6825 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6826 ConfigurationEnvironment, {
6827 UseGenericStateMachineVal,
6828 MayUseNestedParallelismVal,
6829 IsSPMDVal,
6830 MinThreads,
6831 MaxThreads,
6832 MinTeams,
6833 MaxTeams,
6834 ReductionDataSize,
6835 ReductionBufferLength,
6836 });
6837 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6838 KernelEnvironment, {
6839 ConfigurationEnvironmentInitializer,
6840 Ident,
6841 DynamicEnvironment,
6842 });
6843 std::string KernelEnvironmentName =
6844 (KernelName + "_kernel_environment").str();
6845 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6846 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6847 KernelEnvironmentInitializer, KernelEnvironmentName,
6848 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6849 DL.getDefaultGlobalsAddressSpace());
6850 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6851
6852 Constant *KernelEnvironment =
6853 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6854 ? KernelEnvironmentGV
6855 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6856 KernelEnvironmentPtr);
6857 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6858 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6859 KernelLaunchEnvironment =
6860 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6861 ? KernelLaunchEnvironment
6862 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6863 KernelLaunchEnvParamTy);
6864 CallInst *ThreadKind =
6865 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6866
6867 Value *ExecUserCode = Builder.CreateICmpEQ(
6868 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6869 "exec_user_code");
6870
6871 // ThreadKind = __kmpc_target_init(...)
6872 // if (ThreadKind == -1)
6873 // user_code
6874 // else
6875 // return;
6876
6877 auto *UI = Builder.CreateUnreachable();
6878 BasicBlock *CheckBB = UI->getParent();
6879 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6880
6881 BasicBlock *WorkerExitBB = BasicBlock::Create(
6882 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6883 Builder.SetInsertPoint(WorkerExitBB);
6884 Builder.CreateRetVoid();
6885
6886 auto *CheckBBTI = CheckBB->getTerminator();
6887 Builder.SetInsertPoint(CheckBBTI);
6888 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6889
6890 CheckBBTI->eraseFromParent();
6891 UI->eraseFromParent();
6892
6893 // Continue in the "user_code" block, see diagram above and in
6894 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6895 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6896}
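// For a kernel `foo`, this emits globals roughly of the following shape
// (struct bodies abbreviated) before the initialization check sketched above:
//   @foo_dynamic_environment = weak_odr protected global %DynamicEnvironmentTy ...
//   @foo_kernel_environment = weak_odr protected constant %KernelEnvironmentTy
//       { { ...configuration... }, ptr @ident, ptr @foo_dynamic_environment }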
6897
6899 int32_t TeamsReductionDataSize,
6900 int32_t TeamsReductionBufferLength) {
6901 if (!updateToLocation(Loc))
6902 return;
6903
6904 Function *Fn = getOrCreateRuntimeFunctionPtr(
6905 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6906
6907 Builder.CreateCall(Fn, {});
6908
6909 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6910 return;
6911
6912 Function *Kernel = Builder.GetInsertBlock()->getParent();
6913 // We need to strip the debug prefix to get the correct kernel name.
6914 StringRef KernelName = Kernel->getName();
6915 const std::string DebugPrefix = "_debug__";
6916 if (KernelName.ends_with(DebugPrefix))
6917 KernelName = KernelName.drop_back(DebugPrefix.length());
6918 auto *KernelEnvironmentGV =
6919 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6920 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6921 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6922 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6923 KernelEnvironmentInitializer,
6924 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6925 NewInitializer = ConstantFoldInsertValueInstruction(
6926 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6927 {0, 8});
6928 KernelEnvironmentGV->setInitializer(NewInitializer);
6929}
6930
6931 static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6932 bool Min) {
6933 if (Kernel.hasFnAttribute(Name)) {
6934 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6935 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6936 }
6937 Kernel.addFnAttr(Name, llvm::utostr(Value));
6938}
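// E.g., if the kernel already carries "nvvm.maxntid"="128" and Value is 256
// with Min set, the attribute stays at "128"; without Min it becomes "256".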
6939
6940 std::pair<int32_t, int32_t>
6941 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6942 int32_t ThreadLimit =
6943 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6944
6945 if (T.isAMDGPU()) {
6946 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6947 if (!Attr.isValid() || !Attr.isStringAttribute())
6948 return {0, ThreadLimit};
6949 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6950 int32_t LB, UB;
6951 if (!llvm::to_integer(UBStr, UB, 10))
6952 return {0, ThreadLimit};
6953 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6954 if (!llvm::to_integer(LBStr, LB, 10))
6955 return {0, UB};
6956 return {LB, UB};
6957 }
6958
6959 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6960 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6961 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6962 }
6963 return {0, ThreadLimit};
6964}
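// E.g., on AMDGPU with "amdgpu-flat-work-group-size"="128,256" and
// "omp_target_thread_limit"="512" this returns {128, 256}; with a thread
// limit of 192 it returns {128, 192} instead.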
6965
6966 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6967 Function &Kernel, int32_t LB,
6968 int32_t UB) {
6969 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6970
6971 if (T.isAMDGPU()) {
6972 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6973 llvm::utostr(LB) + "," + llvm::utostr(UB));
6974 return;
6975 }
6976
6977 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6978}
6979
6980 std::pair<int32_t, int32_t>
6981 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6982 // TODO: Read from backend annotations if available.
6983 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6984}
6985
6986 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6987 int32_t LB, int32_t UB) {
6988 if (T.isNVPTX())
6989 if (UB > 0)
6990 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
6991 if (T.isAMDGPU())
6992 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6993
6994 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6995}
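// E.g., writeTeamsForKernel(T, K, /*LB=*/4, /*UB=*/8) yields
// "nvvm.maxclusterrank"="8" on NVPTX, "amdgpu-max-num-workgroups"="4,1,1" on
// AMDGPU, and "omp_target_num_teams"="4" on all targets.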
6996
6997 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6998 Function *OutlinedFn) {
6999 if (Config.isTargetDevice()) {
7000 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7001 // TODO: Determine if DSO local can be set to true.
7002 OutlinedFn->setDSOLocal(false);
7003 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7004 if (T.isAMDGCN())
7005 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7006 else if (T.isNVPTX())
7007 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7008 else if (T.isSPIRV())
7009 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7010 }
7011}
7012
7013 Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7014 StringRef EntryFnIDName) {
7015 if (Config.isTargetDevice()) {
7016 assert(OutlinedFn && "The outlined function must exist if embedded");
7017 return OutlinedFn;
7018 }
7019
7020 return new GlobalVariable(
7021 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7022 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7023}
7024
7025 Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7026 StringRef EntryFnName) {
7027 if (OutlinedFn)
7028 return OutlinedFn;
7029
7030 assert(!M.getGlobalVariable(EntryFnName, true) &&
7031 "Named kernel already exists?");
7032 return new GlobalVariable(
7033 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7034 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7035}
7036
7037 Error OpenMPIRBuilder::emitTargetRegionFunction(
7038 TargetRegionEntryInfo &EntryInfo,
7039 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7040 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7041
7042 SmallString<64> EntryFnName;
7043 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7044
7045 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7046 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7047 if (!CBResult)
7048 return CBResult.takeError();
7049 OutlinedFn = *CBResult;
7050 } else {
7051 OutlinedFn = nullptr;
7052 }
7053
7054 // If this target outline function is not an offload entry, we don't need to
7055 // register it. This may be in the case of a false if clause, or if there are
7056 // no OpenMP targets.
7057 if (!IsOffloadEntry)
7058 return Error::success();
7059
7060 std::string EntryFnIDName =
7061 Config.isTargetDevice()
7062 ? std::string(EntryFnName)
7063 : createPlatformSpecificName({EntryFnName, "region_id"});
7064
7065 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7066 EntryFnName, EntryFnIDName);
7067 return Error::success();
7068}
7069
7070 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7071 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7072 StringRef EntryFnName, StringRef EntryFnIDName) {
7073 if (OutlinedFn)
7074 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7075 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7076 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7077 OffloadInfoManager.registerTargetRegionEntryInfo(
7078 EntryInfo, EntryAddr, OutlinedFnID,
7079 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7080 return OutlinedFnID;
7081}
7082
7083 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7084 const LocationDescription &Loc, InsertPointTy AllocaIP,
7085 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7086 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7087 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7088 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7089 BodyGenTy BodyGenType)>
7090 BodyGenCB,
7091 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7092 if (!updateToLocation(Loc))
7093 return InsertPointTy();
7094
7095 Builder.restoreIP(CodeGenIP);
7096 // Disable TargetData CodeGen on Device pass.
7097 if (Config.IsTargetDevice.value_or(false)) {
7098 if (BodyGenCB) {
7099 InsertPointOrErrorTy AfterIP =
7100 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7101 if (!AfterIP)
7102 return AfterIP.takeError();
7103 Builder.restoreIP(*AfterIP);
7104 }
7105 return Builder.saveIP();
7106 }
7107
7108 bool IsStandAlone = !BodyGenCB;
7109 MapInfosTy *MapInfo;
7110 // Generate the code for the opening of the data environment. Capture all the
7111 // arguments of the runtime call by reference because they are used in the
7112 // closing of the region.
7113 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7114 InsertPointTy CodeGenIP) -> Error {
7115 MapInfo = &GenMapInfoCB(Builder.saveIP());
7116 if (Error Err = emitOffloadingArrays(
7117 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7118 /*IsNonContiguous=*/true, DeviceAddrCB))
7119 return Err;
7120
7121 TargetDataRTArgs RTArgs;
7122 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7123
7124 // Emit the number of elements in the offloading arrays.
7125 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7126
7127 // Source location for the ident struct
7128 if (!SrcLocInfo) {
7129 uint32_t SrcLocStrSize;
7130 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7131 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7132 }
7133
7134 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7135 SrcLocInfo, DeviceID,
7136 PointerNum, RTArgs.BasePointersArray,
7137 RTArgs.PointersArray, RTArgs.SizesArray,
7138 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7139 RTArgs.MappersArray};
7140
7141 if (IsStandAlone) {
7142 assert(MapperFunc && "MapperFunc missing for standalone target data");
7143
7144 auto TaskBodyCB = [&](Value *, Value *,
7145 IRBuilderBase::InsertPoint) -> Error {
7146 if (Info.HasNoWait) {
7147 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7148 llvm::Constant::getNullValue(VoidPtr),
7149 llvm::Constant::getNullValue(Int32),
7150 llvm::Constant::getNullValue(VoidPtr)});
7151 }
7152
7153 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7154 OffloadingArgs);
7155
7156 if (Info.HasNoWait) {
7157 BasicBlock *OffloadContBlock =
7158 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7159 Function *CurFn = Builder.GetInsertBlock()->getParent();
7160 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7162 }
7163 return Error::success();
7164 };
7165
7166 bool RequiresOuterTargetTask = Info.HasNoWait;
7167 if (!RequiresOuterTargetTask)
7168 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7169 /*TargetTaskAllocaIP=*/{}));
7170 else
7171 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7172 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7173 } else {
7174 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7175 omp::OMPRTL___tgt_target_data_begin_mapper);
7176
7177 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7178
7179 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7180 if (isa<AllocaInst>(DeviceMap.second.second)) {
7181 auto *LI =
7182 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7183 Builder.CreateStore(LI, DeviceMap.second.second);
7184 }
7185 }
7186
7187 // If device pointer privatization is required, emit the body of the
7188 // region here. It will have to be duplicated: with and without
7189 // privatization.
7190 InsertPointOrErrorTy AfterIP =
7191 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7192 if (!AfterIP)
7193 return AfterIP.takeError();
7194 Builder.restoreIP(*AfterIP);
7195 }
7196 return Error::success();
7197 };
7198
7199 // If we need device pointer privatization, we need to emit the body of the
7200 // region with no privatization in the 'else' branch of the conditional.
7201 // Otherwise, we don't have to do anything.
7202 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7203 InsertPointTy CodeGenIP) -> Error {
7204 InsertPointOrErrorTy AfterIP =
7205 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7206 if (!AfterIP)
7207 return AfterIP.takeError();
7208 Builder.restoreIP(*AfterIP);
7209 return Error::success();
7210 };
7211
7212 // Generate code for the closing of the data region.
7213 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7214 TargetDataRTArgs RTArgs;
7215 Info.EmitDebug = !MapInfo->Names.empty();
7216 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7217
7218 // Emit the number of elements in the offloading arrays.
7219 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7220
7221 // Source location for the ident struct
7222 if (!SrcLocInfo) {
7223 uint32_t SrcLocStrSize;
7224 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7225 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7226 }
7227
7228 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7229 PointerNum, RTArgs.BasePointersArray,
7230 RTArgs.PointersArray, RTArgs.SizesArray,
7231 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7232 RTArgs.MappersArray};
7233 Function *EndMapperFunc =
7234 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7235
7236 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7237 return Error::success();
7238 };
7239
7240 // We don't have to do anything to close the region if the if clause evaluates
7241 // to false.
7242 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7243 return Error::success();
7244 };
7245
7246 Error Err = [&]() -> Error {
7247 if (BodyGenCB) {
7248 Error Err = [&]() {
7249 if (IfCond)
7250 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7251 return BeginThenGen(AllocaIP, Builder.saveIP());
7252 }();
7253
7254 if (Err)
7255 return Err;
7256
7257 // If we don't require privatization of device pointers, we emit the body
7258 // in between the runtime calls. This avoids duplicating the body code.
7259 InsertPointOrErrorTy AfterIP =
7260 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7261 if (!AfterIP)
7262 return AfterIP.takeError();
7263 restoreIPandDebugLoc(Builder, *AfterIP);
7264
7265 if (IfCond)
7266 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7267 return EndThenGen(AllocaIP, Builder.saveIP());
7268 }
7269 if (IfCond)
7270 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7271 return BeginThenGen(AllocaIP, Builder.saveIP());
7272 }();
7273
7274 if (Err)
7275 return Err;
7276
7277 return Builder.saveIP();
7278}
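// For a non-standalone data region (BodyGenCB set) without an if clause, the
// emitted structure is roughly:
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %device, i32 %n,
//       ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes, ptr %names,
//       ptr %mappers)
//   ... body (BodyGenTy::NoPriv) ...
//   call void @__tgt_target_data_end_mapper(<same argument shape>)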
7279
7280 FunctionCallee
7281 OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7282 bool IsGPUDistribute) {
7283 assert((IVSize == 32 || IVSize == 64) &&
7284 "IV size is not compatible with the omp runtime");
7285 RuntimeFunction Name;
7286 if (IsGPUDistribute)
7287 Name = IVSize == 32
7288 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7289 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7290 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7291 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7292 else
7293 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7294 : omp::OMPRTL___kmpc_for_static_init_4u)
7295 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7296 : omp::OMPRTL___kmpc_for_static_init_8u);
7297
7298 return getOrCreateRuntimeFunction(M, Name);
7299}
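// E.g., (IVSize=32, IVSigned=true, IsGPUDistribute=false) selects
// __kmpc_for_static_init_4, whereas (IVSize=64, IVSigned=false,
// IsGPUDistribute=true) selects __kmpc_distribute_static_init_8u.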
7300
7301 FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7302 bool IVSigned) {
7303 assert((IVSize == 32 || IVSize == 64) &&
7304 "IV size is not compatible with the omp runtime");
7305 RuntimeFunction Name = IVSize == 32
7306 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7307 : omp::OMPRTL___kmpc_dispatch_init_4u)
7308 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7309 : omp::OMPRTL___kmpc_dispatch_init_8u);
7310
7311 return getOrCreateRuntimeFunction(M, Name);
7312}
7313
7314 FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7315 bool IVSigned) {
7316 assert((IVSize == 32 || IVSize == 64) &&
7317 "IV size is not compatible with the omp runtime");
7318 RuntimeFunction Name = IVSize == 32
7319 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7320 : omp::OMPRTL___kmpc_dispatch_next_4u)
7321 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7322 : omp::OMPRTL___kmpc_dispatch_next_8u);
7323
7324 return getOrCreateRuntimeFunction(M, Name);
7325}
7326
7327 FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7328 bool IVSigned) {
7329 assert((IVSize == 32 || IVSize == 64) &&
7330 "IV size is not compatible with the omp runtime");
7331 RuntimeFunction Name = IVSize == 32
7332 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7333 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7334 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7335 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7336
7337 return getOrCreateRuntimeFunction(M, Name);
7338}
7339
7340 FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7341 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7342}
7343
7344 static void FixupDebugInfoForOutlinedFunction(
7345 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7346 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7347
7348 DISubprogram *NewSP = Func->getSubprogram();
7349 if (!NewSP)
7350 return;
7351
7352 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7353
7354 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7355 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7356 // Only use cached variable if the arg number matches. This is important
7357 // so that DIVariable created for privatized variables are not discarded.
7358 if (NewVar && (arg == NewVar->getArg()))
7359 return NewVar;
7360
7361 NewVar = llvm::DILocalVariable::get(
7362 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7363 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7364 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7365 return NewVar;
7366 };
7367
7368 auto UpdateDebugRecord = [&](auto *DR) {
7369 DILocalVariable *OldVar = DR->getVariable();
7370 unsigned ArgNo = 0;
7371 for (auto Loc : DR->location_ops()) {
7372 auto Iter = ValueReplacementMap.find(Loc);
7373 if (Iter != ValueReplacementMap.end()) {
7374 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7375 ArgNo = std::get<1>(Iter->second) + 1;
7376 }
7377 }
7378 if (ArgNo != 0)
7379 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7380 };
7381
7382 // The location and scope of variable intrinsics and records still point to
7383 // the parent function of the target region. Update them.
7384 for (Instruction &I : instructions(Func)) {
7385 assert(!isa<llvm::DbgVariableIntrinsic>(&I) &&
7386 "Unexpected debug intrinsic");
7387 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7388 UpdateDebugRecord(&DVR);
7389 }
7390 // An extra argument is passed to the device. Create the debug data for it.
7391 if (OMPBuilder.Config.isTargetDevice()) {
7392 DICompileUnit *CU = NewSP->getUnit();
7393 Module *M = Func->getParent();
7394 DIBuilder DB(*M, true, CU);
7395 DIType *VoidPtrTy =
7396 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7397 DILocalVariable *Var = DB.createParameterVariable(
7398 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7399 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7400 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7401 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7402 &(*Func->begin()));
7403 }
7404}
7405
7406 static Expected<Function *> createOutlinedFunction(
7407 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7408 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7409 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7410 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFn,
7411 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7412 SmallVector<Type *> ParameterTypes;
7413 if (OMPBuilder.Config.isTargetDevice()) {
7414 // Add the "implicit" runtime argument we use to provide launch specific
7415 // information for target devices.
7416 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7417 ParameterTypes.push_back(Int8PtrTy);
7418
7419 // All parameters to target devices are passed as pointers
7420 // or i64. This assumes 64-bit address spaces/pointers.
7421 for (auto &Arg : Inputs)
7422 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7423 ? Arg->getType()
7424 : Type::getInt64Ty(Builder.getContext()));
7425 } else {
7426 for (auto &Arg : Inputs)
7427 ParameterTypes.push_back(Arg->getType());
7428 }
7429
7430 auto BB = Builder.GetInsertBlock();
7431 auto M = BB->getModule();
7432 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7433 /*isVarArg*/ false);
7434 auto Func =
7435 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7436
7437 // Forward target-cpu and target-features function attributes from the
7438 // original function to the new outlined function.
7439 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7440
7441 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7442 if (TargetCpuAttr.isStringAttribute())
7443 Func->addFnAttr(TargetCpuAttr);
7444
7445 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7446 if (TargetFeaturesAttr.isStringAttribute())
7447 Func->addFnAttr(TargetFeaturesAttr);
7448
7449 if (OMPBuilder.Config.isTargetDevice()) {
7450 Value *ExecMode =
7451 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7452 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7453 }
7454
7455 // Save insert point.
7456 IRBuilder<>::InsertPointGuard IPG(Builder);
7457 // We will generate the entries in the outlined function but the debug
7458 // location may still be pointing to the parent function. Reset it now.
7459 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7460
7461 // Generate the region into the function.
7462 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7463 Builder.SetInsertPoint(EntryBB);
7464
7465 // Insert target init call in the device compilation pass.
7466 if (OMPBuilder.Config.isTargetDevice())
7467 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7468
7469 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7470
7471 // As we embed the user code in the middle of our target region after we
7472 // generate entry code, we must move what allocas we can into the entry
7473 // block to avoid breaking possible optimisations for the device.
7474 if (OMPBuilder.Config.isTargetDevice())
7475 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7476
7477 // Insert target deinit call in the device compilation pass.
7478 BasicBlock *OutlinedBodyBB =
7479 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7480 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFn(
7481 Builder.saveIP(),
7482 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7483 if (!AfterIP)
7484 return AfterIP.takeError();
7485 Builder.restoreIP(*AfterIP);
7486 if (OMPBuilder.Config.isTargetDevice())
7487 OMPBuilder.createTargetDeinit(Builder);
7488
7489 // Insert return instruction.
7490 Builder.CreateRetVoid();
7491
7492 // New Alloca IP at entry point of created device function.
7493 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7494 auto AllocaIP = Builder.saveIP();
7495
7496 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7497
7498 // Skip the artificial dyn_ptr on the device.
7499 const auto &ArgRange =
7500 OMPBuilder.Config.isTargetDevice()
7501 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7502 : Func->args();
7503
7504 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7505
7506 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7507 // Things like GEP's can come in the form of Constants. Constants and
7508 // ConstantExpr's do not have access to the knowledge of what they're
7509 // contained in, so we must dig a little to find an instruction so we
7510 // can tell if they're used inside of the function we're outlining. We
7511 // also replace the original constant expression with a new instruction
7512 // equivalent; an instruction as it allows easy modification in the
7513 // following loop, as we can now know the constant (instruction) is
7514 // owned by our target function and replaceUsesOfWith can now be invoked
7515 // on it (cannot do this with constants it seems). A brand new one also
7516 // allows us to be cautious as it is perhaps possible the old expression
7517 // was used inside of the function but exists and is used externally
7518 // (unlikely by the nature of a Constant, but still).
7519 // NOTE: We cannot remove dead constants that have been rewritten to
7520 // instructions at this stage, we run the risk of breaking later lowering
7521 // by doing so as we could still be in the process of lowering the module
7522 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7523 // constants we have created rewritten versions of.
7524 if (auto *Const = dyn_cast<Constant>(Input))
7525 convertUsersOfConstantsToInstructions(Const, Func, false);
7526
7527 // Collect users before iterating over them to avoid invalidating the
7528 // iteration in case a user uses Input more than once (e.g. a call
7529 // instruction).
7530 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7531 // Collect all the instructions.
7532 for (User *User : make_early_inc_range(Users))
7533 if (auto *Instr = dyn_cast<Instruction>(User))
7534 if (Instr->getFunction() == Func)
7535 Instr->replaceUsesOfWith(Input, InputCopy);
7536 };
7537
7538 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7539
7540 // Rewrite uses of input values to parameters.
7541 for (auto InArg : zip(Inputs, ArgRange)) {
7542 Value *Input = std::get<0>(InArg);
7543 Argument &Arg = std::get<1>(InArg);
7544 Value *InputCopy = nullptr;
7545
7547 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7548 if (!AfterIP)
7549 return AfterIP.takeError();
7550 Builder.restoreIP(*AfterIP);
7551 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7552
7553 // In certain cases a Global may be set up for replacement; however, this
7554 // Global may be used in multiple arguments to the kernel, just segmented
7555 // apart. For example, if we have a global array that is sectioned into
7556 // multiple mappings (technically not legal in OpenMP, but there is a case
7557 // in Fortran for Common Blocks where this is necessary), we will end up
7558 // with GEPs into this array inside the kernel, that refer to the Global
7559 // but are technically separate arguments to the kernel for all intents and
7560 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7561 // index, it will fold into a referral to the Global; if we then encounter
7562 // this folded GEP during replacement, all of the references to the
7563 // Global in the kernel will be replaced with the argument we have generated
7564 // that corresponds to it, including any other GEPs that refer to the
7565 // Global that may be other arguments. This will invalidate all of the other
7566 // preceding mapped arguments that refer to the same global that may be
7567 // separate segments. To prevent this, we defer global processing until all
7568 // other processing has been performed.
7569 if (isa<GlobalValue>(Input)) {
7570 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7571 continue;
7572 }
7573
7574 if (isa<ConstantData>(Input))
7575 continue;
7576
7577 ReplaceValue(Input, InputCopy, Func);
7578 }
7579
7580 // Replace all of our deferred Input values, currently just Globals.
7581 for (auto Deferred : DeferredReplacement)
7582 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7583
7584 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7585 ValueReplacementMap);
7586 return Func;
7587}
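// On the device, the outlined kernel produced here has roughly the shape
// (function name and trailing arguments illustrative):
//   define internal void @__omp_offloading_xy_foo(ptr %dyn_ptr, ptr %a, i64 %b) {
//   entry:
//     %tk = call i32 @__kmpc_target_init(ptr @env, ptr %dyn_ptr)
//     ... user code ...
//     call void @__kmpc_target_deinit()
//     ret void
//   }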
7588/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7589/// of pointers containing shared data between the parent task and the created
7590/// task.
7591 static Value *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7592 IRBuilderBase &Builder,
7593 Value *TaskWithPrivates,
7594 Type *TaskWithPrivatesTy) {
7595
7596 Type *TaskTy = OMPIRBuilder.Task;
7597 LLVMContext &Ctx = Builder.getContext();
7598 Value *TaskT =
7599 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7600 Value *Shareds = TaskT;
7601 // TaskWithPrivatesTy can be one of the following
7602 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7603 // %struct.privates }
7604 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7605 //
7606 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7607 // its first member has to be the task descriptor. TaskTy is the type of the
7608 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7609 // first member of TaskT, gives us the pointer to shared data.
7610 if (TaskWithPrivatesTy != TaskTy)
7611 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7612 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7613}
7614 /// Create an entry point for a target task. It'll have the following
7615 /// signature:
7616/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7617/// This function is called from emitTargetTask once the
7618/// code to launch the target kernel has been outlined already.
7619/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7620/// into the task structure so that the deferred target task can access this
7621/// data even after the stack frame of the generating task has been rolled
7622/// back. Offloading arrays contain base pointers, pointers, sizes etc
7623/// of the data that the target kernel will access. These in effect are the
7624/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7625 static Function *emitTargetTaskProxyFunction(
7626 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7627 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7628 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7629
7630 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7631 // This is because PrivatesTy is the type of the structure in which
7632 // we pass the offloading arrays to the deferred target task.
7633 assert((!NumOffloadingArrays || PrivatesTy) &&
7634 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7635 "to privatize");
7636
7637 Module &M = OMPBuilder.M;
7638 // KernelLaunchFunction is the target launch function, i.e.
7639 // the function that sets up kernel arguments and calls
7640 // __tgt_target_kernel to launch the kernel on the device.
7641 //
7642 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7643
7644 // StaleCI is the CallInst which is the call to the outlined
7645 // target kernel launch function. If there are local live-in values
7646 // that the outlined function uses then these are aggregated into a structure
7647 // which is passed as the second argument. If there are no local live-in
7648 // values or if all values used by the outlined kernel are global variables,
7649 // then there's only one argument, the threadID. So, StaleCI can be
7650 //
7651 // %structArg = alloca { ptr, ptr }, align 8
7652 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7653 // store ptr %20, ptr %gep_, align 8
7654 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7655 // store ptr %21, ptr %gep_8, align 8
7656 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7657 //
7658 // OR
7659 //
7660 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7661 Builder.SetInsertPoint(StaleCI->getParent(),
7662 StaleCI->getIterator());
7663
7664 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7665
7666 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7667 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7668 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7669
7670 auto ProxyFnTy =
7671 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7672 /* isVarArg */ false);
7673 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7674 ".omp_target_task_proxy_func",
7675 Builder.GetInsertBlock()->getModule());
7676 Value *ThreadId = ProxyFn->getArg(0);
7677 Value *TaskWithPrivates = ProxyFn->getArg(1);
7678 ThreadId->setName("thread.id");
7679 TaskWithPrivates->setName("task");
7680
7681 bool HasShareds = SharedArgsOperandNo > 0;
7682 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7683 BasicBlock *EntryBB =
7684 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7685 Builder.SetInsertPoint(EntryBB);
7686
7687 SmallVector<Value *> KernelLaunchArgs;
7688 KernelLaunchArgs.reserve(StaleCI->arg_size());
7689 KernelLaunchArgs.push_back(ThreadId);
7690
7691 if (HasOffloadingArrays) {
7692 assert(TaskTy != TaskWithPrivatesTy &&
7693 "If there are offloading arrays to pass to the target"
7694 "TaskTy cannot be the same as TaskWithPrivatesTy");
7695 (void)TaskTy;
7696 Value *Privates =
7697 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7698 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7699 KernelLaunchArgs.push_back(
7700 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7701 }
7702
7703 if (HasShareds) {
7704 auto *ArgStructAlloca =
7705 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7706 assert(ArgStructAlloca &&
7707 "Unable to find the alloca instruction corresponding to arguments "
7708 "for extracted function");
7709 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7710
7711 AllocaInst *NewArgStructAlloca =
7712 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7713
7714 Value *SharedsSize =
7715 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7716
7717 Value *LoadShared = loadSharedDataFromTaskDescriptor(
7718 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7719
7720 Builder.CreateMemCpy(
7721 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7722 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7723 KernelLaunchArgs.push_back(NewArgStructAlloca);
7724 }
7725 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7726 Builder.CreateRetVoid();
7727 return ProxyFn;
7728}
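// For a target task with neither shareds nor offloading arrays, the proxy
// degenerates to a simple trampoline; roughly (illustrative, with a made-up
// name standing in for the outlined kernel-launch function):
//   define internal void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) {
//   entry:
//     call void @kernel_launch_function(i32 %thread.id)
//     ret void
//   }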
7729static Type *getOffloadingArrayType(Value *V) {
7730
7731 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7732 return GEP->getSourceElementType();
7733 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7734 return Alloca->getAllocatedType();
7735
7736 llvm_unreachable("Unhandled Instruction type");
7737 return nullptr;
7738}
7739// This function returns a struct that has at most two members.
7740// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7741// descriptor. The second member, if needed, is a struct containing arrays
7742// that need to be passed to the offloaded target kernel. For example,
7743// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7744// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7745// respectively, then the types created by this function are
7746//
7747// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7748// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7749// %struct.privates }
7750// %struct.task_with_privates is returned by this function.
7751// If there aren't any offloading arrays to pass to the target kernel,
7752// %struct.kmp_task_ompbuilder_t is returned.
7753static StructType *
7754createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7755 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7756
7757 if (OffloadingArraysToPrivatize.empty())
7758 return OMPIRBuilder.Task;
7759
7760 SmallVector<Type *, 4> StructFieldTypes;
7761 for (Value *V : OffloadingArraysToPrivatize) {
7762 assert(V->getType()->isPointerTy() &&
7763 "Expected pointer to array to privatize. Got a non-pointer value "
7764 "instead");
7765 Type *ArrayTy = getOffloadingArrayType(V);
7766 assert(ArrayTy && "ArrayType cannot be nullptr");
7767 StructFieldTypes.push_back(ArrayTy);
7768 }
7769 StructType *PrivatesStructTy =
7770 StructType::create(StructFieldTypes, "struct.privates");
7771 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7772 "struct.task_with_privates");
7773}
7774static Error emitTargetOutlinedFunction(
7775 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7776 TargetRegionEntryInfo &EntryInfo,
7777 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7778 Function *&OutlinedFn, Constant *&OutlinedFnID,
7779 SmallVectorImpl<Value *> &Inputs,
7780 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7781 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
7782
7783 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7784 [&](StringRef EntryFnName) {
7785 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7786 EntryFnName, Inputs, CBFunc,
7787 ArgAccessorFuncCB);
7788 };
7789
7790 return OMPBuilder.emitTargetRegionFunction(
7791 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7792 OutlinedFnID);
7793}
7794
7795OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7796 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7797 OpenMPIRBuilder::InsertPointTy AllocaIP,
7798 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7799 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7800
7801 // The following explains the code-gen scenario for the `target` directive. A
7802 // similar scenario is followed for other device-related directives (e.g.
7803 // `target enter data`), in which case we only need to emit a task
7804 // that encapsulates the proper runtime call.
7805 //
7806 // When we arrive at this function, the target region itself has been
7807 // outlined into the function OutlinedFn.
7808 // So at this point, for
7809 // --------------------------------------------------------------
7810 // void user_code_that_offloads(...) {
7811 // omp target depend(..) map(from:a) map(to:b) private(i)
7812 // do i = 1, 10
7813 // a(i) = b(i) + n
7814 // }
7815 //
7816 // --------------------------------------------------------------
7817 //
7818 // we have
7819 //
7820 // --------------------------------------------------------------
7821 //
7822 // void user_code_that_offloads(...) {
7823 // %.offload_baseptrs = alloca [2 x ptr], align 8
7824 // %.offload_ptrs = alloca [2 x ptr], align 8
7825 // %.offload_mappers = alloca [2 x ptr], align 8
7826 // ;; target region has been outlined and now we need to
7827 // ;; offload to it via a target task.
7828 // }
7829 // void outlined_device_function(ptr a, ptr b, ptr n) {
7830 // n = *n_ptr;
7831 // do i = 1, 10
7832 // a(i) = b(i) + n
7833 // }
7834 //
7835 // We have to now do the following
7836 // (i) Make an offloading call to outlined_device_function using the OpenMP
7837 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7838 // emitted by emitKernelLaunch
7839 // (ii) Create a task entry point function that calls kernel_launch_function
7840 // and is the entry point for the target task. See
7841 // '@.omp_target_task_proxy_func' in the pseudocode below.
7842 // (iii) Create a task with the task entry point created in (ii)
7843 //
7844 // That is we create the following
7845 // struct task_with_privates {
7846 // struct kmp_task_ompbuilder_t task_struct;
7847 // struct privates {
7848 // [2 x ptr] ; baseptrs
7849 // [2 x ptr] ; ptrs
7850 // [2 x i64] ; sizes
7851 // }
7852 // }
7853 // void user_code_that_offloads(...) {
7854 // %.offload_baseptrs = alloca [2 x ptr], align 8
7855 // %.offload_ptrs = alloca [2 x ptr], align 8
7856 // %.offload_sizes = alloca [2 x i64], align 8
7857 //
7858 // %structArg = alloca { ptr, ptr, ptr }, align 8
7859 // %structArg[0] = a
7860 // %structArg[1] = b
7861 // %structArg[2] = &n
7862 //
7863 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7864 // sizeof(kmp_task_ompbuilder_t),
7865 // sizeof(structArg),
7866 // @.omp_target_task_proxy_func,
7867 // ...)
7868 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7869 // sizeof(structArg))
7870 // memcpy(target_task_with_privates->privates->baseptrs,
7871 // offload_baseptrs, sizeof(offload_baseptrs))
7872 // memcpy(target_task_with_privates->privates->ptrs,
7873 // offload_ptrs, sizeof(offload_ptrs))
7874 // memcpy(target_task_with_privates->privates->sizes,
7875 // offload_sizes, sizeof(offload_sizes))
7876 // dependencies_array = ...
7877 // ;; if nowait not present
7878 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7879 // call @__kmpc_omp_task_begin_if0(...)
7880 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
7881 // %target_task_with_privates)
7882 // call @__kmpc_omp_task_complete_if0(...)
7883 // }
7884 //
7885 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7886 // ptr %task) {
7887 // %structArg = alloca {ptr, ptr, ptr}
7888 // %task_ptr = getelementptr(%task, 0, 0)
7889 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7890 // memcpy(%structArg, %shared_data, sizeof(%structArg))
7891 //
7892 // %offloading_arrays = getelementptr(%task, 0, 1)
7893 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7894 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7895 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7896 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7897 // %offload_sizes, %structArg)
7898 // }
7899 //
7900 // We need the proxy function because the signature of the task entry point
7901 // expected by kmpc_omp_task is always the same and will be different from
7902 // that of the kernel_launch function.
7903 //
7904 // kernel_launch_function is generated by emitKernelLaunch and has the
7905 // always_inline attribute. For this example, it'll look like so:
7906 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7907 // %offload_sizes, %structArg) alwaysinline {
7908 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7909 // ; load aggregated data from %structArg
7910 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7911 // ; offload_sizes
7912 // call i32 @__tgt_target_kernel(...,
7913 // outlined_device_function,
7914 // ptr %kernel_args)
7915 // }
7916 // void outlined_device_function(ptr a, ptr b, ptr n) {
7917 // n = *n_ptr;
7918 // do i = 1, 10
7919 // a(i) = b(i) + n
7920 // }
7921 //
7922 BasicBlock *TargetTaskBodyBB =
7923 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7924 BasicBlock *TargetTaskAllocaBB =
7925 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7926
7927 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7928 TargetTaskAllocaBB->begin());
7929 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7930
7931 OutlineInfo OI;
7932 OI.EntryBB = TargetTaskAllocaBB;
7933 OI.OuterAllocaBB = AllocaIP.getBlock();
7934
7935 // Add the thread ID argument.
7936 SmallVector<Instruction *, 4> ToBeDeleted;
7937 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7938 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7939
7940 // Generate the task body which will subsequently be outlined.
7941 Builder.restoreIP(TargetTaskBodyIP);
7942 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7943 return Err;
7944
7945 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7946 // it is given. These blocks are enumerated by
7947 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7948 // to be outside the region. In other words, OI.ExitBlock is expected to be
7949 // the start of the region after the outlining. We used to set OI.ExitBlock
7950 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7951 // except when the task body is a single basic block. In that case,
7952 // OI.ExitBlock is set to the single task body block and will get left out of
7953 // the outlining process. So, simply create a new empty block to which we
7954 // unconditionally branch from where TaskBodyCB left off.
7955 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7956 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7957 /*IsFinished=*/true);
7958
7959 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7960 bool NeedsTargetTask = HasNoWait && DeviceID;
7961 if (NeedsTargetTask) {
7962 for (auto *V :
7963 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7964 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7965 RTArgs.SizesArray}) {
7966 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
7967 OffloadingArraysToPrivatize.push_back(V);
7968 OI.ExcludeArgsFromAggregate.push_back(V);
7969 }
7970 }
7971 }
7972 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7973 DeviceID, OffloadingArraysToPrivatize](
7974 Function &OutlinedFn) mutable {
7975 assert(OutlinedFn.hasOneUse() &&
7976 "there must be a single user for the outlined function");
7977
7978 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7979
7980 // The first argument of StaleCI is always the thread id.
7981 // The next few arguments are the pointers to offloading arrays
7982 // if any (see OffloadingArraysToPrivatize).
7983 // Finally, all other local values that are live-in into the outlined region
7984 // end up in a structure whose pointer is passed as the last argument. This
7985 // piece of data is passed in the "shared" field of the task structure. So,
7986 // we know we have to pass shareds to the task if the number of arguments is
7987 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
7988 // thread id. Further, for safety, we assert that the number of arguments
7989 // of StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
7990 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
7991 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
7992 assert((!HasShareds ||
7993 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
7994 "Wrong number of arguments for StaleCI when shareds are present");
7995 int SharedArgOperandNo =
7996 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
7997
7998 StructType *TaskWithPrivatesTy =
7999 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8000 StructType *PrivatesTy = nullptr;
8001
8002 if (!OffloadingArraysToPrivatize.empty())
8003 PrivatesTy =
8004 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8005
8007 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8008 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8009
8010 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8011 << "\n");
8012
8013 Builder.SetInsertPoint(StaleCI);
8014
8015 // Gather the arguments for emitting the runtime call.
8016 uint32_t SrcLocStrSize;
8017 Constant *SrcLocStr =
8018 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8019 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8020
8021 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8022 //
8023 // If `NeedsTargetTask == true`, we call @__kmpc_omp_target_task_alloc,
8024 // both to provide the DeviceID to the deferred task and because
8025 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8026 Function *TaskAllocFn =
8027 !NeedsTargetTask
8028 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8029 : getOrCreateRuntimeFunctionPtr(
8030 OMPRTL___kmpc_omp_target_task_alloc);
8031
8032 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
8033 // call.
8034 Value *ThreadID = getOrCreateThreadID(Ident);
8035
8036 // Argument - `sizeof_kmp_task_t` (TaskSize)
8037 // Tasksize refers to the size in bytes of kmp_task_t data structure
8038 // plus any other data to be passed to the target task, if any, which
8039 // is packed into a struct. kmp_task_t and the struct so created are
8040 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8041 Value *TaskSize = Builder.getInt64(
8042 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8043
8044 // Argument - `sizeof_shareds` (SharedsSize)
8045 // SharedsSize refers to the shareds array size in the kmp_task_t data
8046 // structure.
8047 Value *SharedsSize = Builder.getInt64(0);
8048 if (HasShareds) {
8049 auto *ArgStructAlloca =
8050 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8051 assert(ArgStructAlloca &&
8052 "Unable to find the alloca instruction corresponding to arguments "
8053 "for extracted function");
8054 auto *ArgStructType =
8055 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8056 assert(ArgStructType && "Unable to find struct type corresponding to "
8057 "arguments for extracted function");
8058 SharedsSize =
8059 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8060 }
8061
8062 // Argument - `flags`
8063 // Task is tied iff (Flags & 1) == 1.
8064 // Task is untied iff (Flags & 1) == 0.
8065 // Task is final iff (Flags & 2) == 2.
8066 // Task is not final iff (Flags & 2) == 0.
8067 // A target task is not final and is untied.
8068 Value *Flags = Builder.getInt32(0);
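    // For illustration, given the encoding above: a tied, non-final task
    // would use Flags = 1 and a tied final task Flags = 3; target tasks
    // always pass 0 (untied, not final).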
8069
8070 // Emit the @__kmpc_omp_task_alloc runtime call
8071 // The runtime call returns a pointer to an area where the task captured
8072 // variables must be copied before the task is run (TaskData)
8073 CallInst *TaskData = nullptr;
8074
8075 SmallVector<llvm::Value *> TaskAllocArgs = {
8076 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8077 /*flags=*/Flags,
8078 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8079 /*task_func=*/ProxyFn};
8080
8081 if (NeedsTargetTask) {
8082 assert(DeviceID && "Expected non-empty device ID.");
8083 TaskAllocArgs.push_back(DeviceID);
8084 }
8085
8086 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8087
8088 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8089 if (HasShareds) {
8090 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8091 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8092 *this, Builder, TaskData, TaskWithPrivatesTy);
8093 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8094 SharedsSize);
8095 }
8096 if (!OffloadingArraysToPrivatize.empty()) {
8097 Value *Privates =
8098 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8099 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8100 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8101 [[maybe_unused]] Type *ArrayType =
8102 getOffloadingArrayType(PtrToPrivatize);
8103 assert(ArrayType && "ArrayType cannot be nullptr");
8104
8105 Type *ElementType = PrivatesTy->getElementType(i);
8106 assert(ElementType == ArrayType &&
8107 "ElementType should match ArrayType");
8108 (void)ArrayType;
8109
8110 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8111 Builder.CreateMemCpy(
8112 Dst, Alignment, PtrToPrivatize, Alignment,
8113 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8114 }
8115 }
8116
8117 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8118
8119 // ---------------------------------------------------------------
8120 // V5.2 13.8 target construct
8121 // If the nowait clause is present, execution of the target task
8122 // may be deferred. If the nowait clause is not present, the target task is
8123 // an included task.
8124 // ---------------------------------------------------------------
8125 // The above means that the lack of a nowait on the target construct
8126 // translates to '#pragma omp task if(0)'
8127 if (!NeedsTargetTask) {
8128 if (DepArray) {
8129 Function *TaskWaitFn =
8130 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8132 TaskWaitFn,
8133 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8134 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8135 /*dep_list=*/DepArray,
8136 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8137 /*noalias_dep_list=*/
8138 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8139 }
8140 // Included task.
8141 Function *TaskBeginFn =
8142 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8143 Function *TaskCompleteFn =
8144 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8145 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8146 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8147 CI->setDebugLoc(StaleCI->getDebugLoc());
8148 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8149 } else if (DepArray) {
8150 // HasNoWait - meaning the task may be deferred. Call
8151 // __kmpc_omp_task_with_deps if there are dependencies,
8152 // else call __kmpc_omp_task
8153 Function *TaskFn =
8154 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8155 Builder.CreateCall(
8156 TaskFn,
8157 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8158 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8159 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8160 } else {
8161 // Emit the @__kmpc_omp_task runtime call to spawn the task
8162 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8163 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8164 }
8165
8166 StaleCI->eraseFromParent();
8167 for (Instruction *I : llvm::reverse(ToBeDeleted))
8168 I->eraseFromParent();
8169 };
8170 addOutlineInfo(std::move(OI));
8171
8172 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8173 << *(Builder.GetInsertBlock()) << "\n");
8174 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8176 << "\n");
8177 return Builder.saveIP();
8178}
8179
8180Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8181 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8182 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8183 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8184 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8185 if (Error Err =
8186 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8187 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8188 return Err;
8189 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8190 return Error::success();
8191}
8192
8193static void emitTargetCall(
8194 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8195 OpenMPIRBuilder::InsertPointTy AllocaIP,
8196 OpenMPIRBuilder::TargetDataInfo &Info,
8197 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8198 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8199 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8200 SmallVectorImpl<Value *> &Args,
8201 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8202 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8203 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8204 bool HasNoWait) {
8205 // Generate a function call to the host fallback implementation of the target
8206 // region. This is called by the host when no offload entry was generated for
8207 // the target region and when the offloading call fails at runtime.
8208 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8210 Builder.restoreIP(IP);
8211 Builder.CreateCall(OutlinedFn, Args);
8212 return Builder.saveIP();
8213 };
8214
8215 bool HasDependencies = Dependencies.size() > 0;
8216 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8217
8218 OpenMPIRBuilder::TargetKernelArgs KArgs;
8219
8220 auto TaskBodyCB =
8221 [&](Value *DeviceID, Value *RTLoc,
8222 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8223 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8224 // produce any.
8225 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() -> OpenMPIRBuilder::InsertPointOrErrorTy {
8226 // emitKernelLaunch makes the necessary runtime call to offload the
8227 // kernel. We then outline all that code into a separate function
8228 // ('kernel_launch_function' in the pseudo code above). This function is
8229 // then called by the target task proxy function (see
8230 // '@.omp_target_task_proxy_func' in the pseudo code above)
8231 // "@.omp_target_task_proxy_func' is generated by
8232 // emitTargetTaskProxyFunction.
8233 if (OutlinedFnID && DeviceID)
8234 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8235 EmitTargetCallFallbackCB, KArgs,
8236 DeviceID, RTLoc, TargetTaskAllocaIP);
8237
8238 // We only need to do the outlining if `DeviceID` is set to avoid calling
8239 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8240 // generating the `else` branch of an `if` clause.
8241 //
8242 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8243 // In this case, we execute the host implementation directly.
8244 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8245 }());
8246
8247 OMPBuilder.Builder.restoreIP(AfterIP);
8248 return Error::success();
8249 };
8250
8251 auto &&EmitTargetCallElse =
8252 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8253 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8254 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8255 // produce any.
8256 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() -> OpenMPIRBuilder::InsertPointOrErrorTy {
8257 if (RequiresOuterTargetTask) {
8258 // Arguments that are intended to be directly forwarded to an
8259 // emitKernelLaunch call are passed as nullptr, since
8260 // OutlinedFnID=nullptr results in that call not being done.
8261 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8262 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8263 /*RTLoc=*/nullptr, AllocaIP,
8264 Dependencies, EmptyRTArgs, HasNoWait);
8265 }
8266 return EmitTargetCallFallbackCB(Builder.saveIP());
8267 }());
8268
8269 Builder.restoreIP(AfterIP);
8270 return Error::success();
8271 };
8272
8273 auto &&EmitTargetCallThen =
8274 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8275 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8276 Info.HasNoWait = HasNoWait;
8277 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8278 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8279 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8280 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8281 /*IsNonContiguous=*/true,
8282 /*ForEndCall=*/false))
8283 return Err;
8284
8285 SmallVector<Value *, 3> NumTeamsC;
8286 for (auto [DefaultVal, RuntimeVal] :
8287 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8288 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8289 : Builder.getInt32(DefaultVal));
8290
8291 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8292 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8293 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8294 if (Clause)
8295 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8296 /*isSigned=*/false);
8297 return Clause;
8298 };
8299 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8300 if (Clause)
8301 Result =
8302 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8303 Result, Clause)
8304 : Clause;
8305 };
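    // Worked example (illustrative): if one clause yields 4 and another 8,
    // the combined value is the unsigned minimum, 4, computed by the
    // icmp ult + select above; if neither clause is present the entry stays
    // 0, which tells the runtime to pick a default.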
8306
8307 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8308 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8309 SmallVector<Value *, 3> NumThreadsC;
8310 Value *MaxThreadsClause =
8311 RuntimeAttrs.TeamsThreadLimit.size() == 1
8312 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8313 : nullptr;
8314
8315 for (auto [TeamsVal, TargetVal] : zip_equal(
8316 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8317 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8318 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8319
8320 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8321 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8322
8323 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8324 }
8325
8326 unsigned NumTargetItems = Info.NumberOfPtrs;
8327 // TODO: Use correct device ID
8328 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8329 uint32_t SrcLocStrSize;
8330 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8331 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8332 llvm::omp::IdentFlag(0), 0);
8333
8334 Value *TripCount = RuntimeAttrs.LoopTripCount
8335 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8336 Builder.getInt64Ty(),
8337 /*isSigned=*/false)
8338 : Builder.getInt64(0);
8339
8340 // TODO: Use correct DynCGGroupMem
8341 Value *DynCGGroupMem = Builder.getInt32(0);
8342
8343 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8344 NumTeamsC, NumThreadsC,
8345 DynCGGroupMem, HasNoWait);
8346
8347 // Assume no error was returned because TaskBodyCB and
8348 // EmitTargetCallFallbackCB don't produce any.
8349 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() -> OpenMPIRBuilder::InsertPointOrErrorTy {
8350 // The presence of certain clauses on the target directive requires the
8351 // explicit generation of the target task.
8352 if (RequiresOuterTargetTask)
8353 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8354 Dependencies, KArgs.RTArgs,
8355 Info.HasNoWait);
8356
8357 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8358 EmitTargetCallFallbackCB, KArgs,
8359 DeviceID, RTLoc, AllocaIP);
8360 }());
8361
8362 Builder.restoreIP(AfterIP);
8363 return Error::success();
8364 };
8365
8366 // If we don't have an ID for the target region, it means an offload entry
8367 // wasn't created. In this case we just run the host fallback directly and
8368 // ignore any potential 'if' clauses.
8369 if (!OutlinedFnID) {
8370 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8371 return;
8372 }
8373
8374 // If there's no 'if' clause, only generate the kernel launch code path.
8375 if (!IfCond) {
8376 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8377 return;
8378 }
8379
8380 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8381 EmitTargetCallElse, AllocaIP));
8382}
8383
8384OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8385 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8386 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8387 TargetRegionEntryInfo &EntryInfo,
8388 const TargetKernelDefaultAttrs &DefaultAttrs,
8389 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8390 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8391 TargetBodyGenCallbackTy CBFunc,
8392 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8393 CustomMapperCallbackTy CustomMapperCB,
8394 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8395
8396 if (!updateToLocation(Loc))
8397 return InsertPointTy();
8398
8399 Builder.restoreIP(CodeGenIP);
8400
8401 Function *OutlinedFn;
8402 Constant *OutlinedFnID = nullptr;
8403 // The target region is outlined into its own function. The LLVM IR for
8404 // the target region itself is generated using the callbacks CBFunc
8405 // and ArgAccessorFuncCB.
8406 if (Error Err = emitTargetOutlinedFunction(
8407 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8408 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8409 return Err;
8410
8411 // If we are not on the target device, then we need to generate code
8412 // to make a remote call (offload) to the previously outlined function
8413 // that represents the target region. Do that now.
8414 if (!Config.isTargetDevice())
8415 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8416 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8417 CustomMapperCB, Dependencies, HasNowait);
8418 return Builder.saveIP();
8419}
8420
8421std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8422 StringRef FirstSeparator,
8423 StringRef Separator) {
8424 SmallString<128> Buffer;
8425 llvm::raw_svector_ostream OS(Buffer);
8426 StringRef Sep = FirstSeparator;
8427 for (StringRef Part : Parts) {
8428 OS << Sep << Part;
8429 Sep = Separator;
8430 }
8431 return OS.str().str();
8432}
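// Usage sketch (illustrative): getNameWithSeparators({"x", "y"}, "$", ".")
// returns "$x.y": the first separator prefixes the first part, and the
// regular separator joins each remaining part.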
8433
8434std::string
8436 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8437 Config.separator());
8438}
8439
8440GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(llvm::Type *Ty,
8441 const llvm::Twine &Name,
8442 unsigned AddressSpace) {
8443 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8444 if (Elem.second) {
8445 assert(Elem.second->getValueType() == Ty &&
8446 "OMP internal variable has different type than requested");
8447 } else {
8448 // TODO: investigate the appropriate linkage type used for the global
8449 // variable for possibly changing that to internal or private, or maybe
8450 // create different versions of the function for different OMP internal
8451 // variables.
8452 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8453 ? GlobalValue::InternalLinkage
8454 : GlobalValue::CommonLinkage;
8455 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8456 Constant::getNullValue(Ty), Elem.first(),
8457 /*InsertBefore=*/nullptr,
8458 GlobalValue::NotThreadLocal, AddressSpace);
8459 const DataLayout &DL = M.getDataLayout();
8460 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8461 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
8462 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8463 Elem.second = GV;
8464 }
8465
8466 return Elem.second;
8467}
8468
8469Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8470 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8471 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8472 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8473}
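// For example, a critical region named "foo" maps to the internal variable
// ".gomp_critical_user_foo.var" (built with "." as both the first and the
// regular separator), which serves as the lock for that named region.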
8474
8475Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8476 LLVMContext &Ctx = Builder.getContext();
8477 Value *Null =
8478 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8479 Value *SizeGep =
8480 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8481 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8482 return SizePtrToInt;
8483}
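// The above is the classic "sizeof via GEP" idiom; for an opaque pointer
// BasePtr it emits roughly (illustrative names):
//   %size.gep = getelementptr ptr, ptr null, i32 1
//   %size = ptrtoint ptr %size.gep to i64
// i.e. the byte offset of element 1 from a null base, which equals the
// allocation size of BasePtr's type.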
8484
8485GlobalVariable *
8486OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8487 std::string VarName) {
8488 llvm::Constant *MaptypesArrayInit =
8489 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8490 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8491 M, MaptypesArrayInit->getType(),
8492 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8493 VarName);
8494 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8495 return MaptypesArrayGlobal;
8496}
8497
8498void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8499 InsertPointTy AllocaIP,
8500 unsigned NumOperands,
8501 struct MapperAllocas &MapperAllocas) {
8502 if (!updateToLocation(Loc))
8503 return;
8504
8505 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8506 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8507 Builder.restoreIP(AllocaIP);
8508 AllocaInst *ArgsBase = Builder.CreateAlloca(
8509 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8510 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8511 ".offload_ptrs");
8512 AllocaInst *ArgSizes = Builder.CreateAlloca(
8513 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8514 updateToLocation(Loc);
8515 MapperAllocas.ArgsBase = ArgsBase;
8516 MapperAllocas.Args = Args;
8517 MapperAllocas.ArgSizes = ArgSizes;
8518}
8519
8520void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8521 Function *MapperFunc, Value *SrcLocInfo,
8522 Value *MaptypesArg, Value *MapnamesArg,
8523 struct MapperAllocas &MapperAllocas,
8524 int64_t DeviceID, unsigned NumOperands) {
8525 if (!updateToLocation(Loc))
8526 return;
8527
8528 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8529 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8530 Value *ArgsBaseGEP =
8531 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8532 {Builder.getInt32(0), Builder.getInt32(0)});
8533 Value *ArgsGEP =
8534 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8535 {Builder.getInt32(0), Builder.getInt32(0)});
8536 Value *ArgSizesGEP =
8537 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8538 {Builder.getInt32(0), Builder.getInt32(0)});
8539 Value *NullPtr =
8540 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8541 Builder.CreateCall(MapperFunc,
8542 {SrcLocInfo, Builder.getInt64(DeviceID),
8543 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8544 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8545}
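// The emitted call has the shape (illustrative; MapperFunc is whichever
// runtime entry the caller passes, e.g. __tgt_target_data_begin_mapper):
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 %device_id,
//       i32 %num_operands, ptr %baseptrs, ptr %ptrs, ptr %sizes,
//       ptr %maptypes, ptr %mapnames, ptr null)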
8546
8547void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8548 TargetDataRTArgs &RTArgs,
8549 TargetDataInfo &Info,
8550 bool ForEndCall) {
8551 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8552 "expected region end call to runtime only when end call is separate");
8553 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8554 auto VoidPtrTy = UnqualPtrTy;
8555 auto VoidPtrPtrTy = UnqualPtrTy;
8556 auto Int64Ty = Type::getInt64Ty(M.getContext());
8557 auto Int64PtrTy = UnqualPtrTy;
8558
8559 if (!Info.NumberOfPtrs) {
8560 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8561 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8562 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8563 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8564 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8565 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8566 return;
8567 }
8568
8569 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8570 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8571 Info.RTArgs.BasePointersArray,
8572 /*Idx0=*/0, /*Idx1=*/0);
8573 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8574 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8575 /*Idx0=*/0,
8576 /*Idx1=*/0);
8577 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8578 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8579 /*Idx0=*/0, /*Idx1=*/0);
8580 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8581 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8582 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8583 : Info.RTArgs.MapTypesArray,
8584 /*Idx0=*/0,
8585 /*Idx1=*/0);
8586
8587 // Only emit the mapper information arrays if debug information is
8588 // requested.
8589 if (!Info.EmitDebug)
8590 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8591 else
8592 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8593 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8594 /*Idx0=*/0,
8595 /*Idx1=*/0);
8596 // If there is no user-defined mapper, set the mapper array to nullptr to
8597 // avoid an unnecessary data privatization
8598 if (!Info.HasMapper)
8599 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8600 else
8601 RTArgs.MappersArray =
8602 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8603}
8604
8605void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8606 InsertPointTy CodeGenIP,
8607 MapInfosTy &CombinedInfo,
8608 TargetDataInfo &Info) {
8609 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8610 CombinedInfo.NonContigInfo;
8611
8612 // Build an array of struct descriptor_dim and then assign it to
8613 // offload_args.
8614 //
8615 // struct descriptor_dim {
8616 // uint64_t offset;
8617 // uint64_t count;
8618 // uint64_t stride
8619 // };
8620 Type *Int64Ty = Builder.getInt64Ty();
8621 StructType *DimTy = StructType::create(
8622 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8623 "struct.descriptor_dim");
8624
8625 enum { OffsetFD = 0, CountFD, StrideFD };
8626 // We need two index variables here since the size of "Dims" is the same as
8627 // the size of Components; however, the size of offset, count, and stride is
8628 // equal to the number of base declarations that are non-contiguous.
8629 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8630 // Skip emitting IR if dimension size is 1 since it cannot be
8631 // non-contiguous.
8632 if (NonContigInfo.Dims[I] == 1)
8633 continue;
8634 Builder.restoreIP(AllocaIP);
8635 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8636 AllocaInst *DimsAddr =
8637 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8638 Builder.restoreIP(CodeGenIP);
8639 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8640 unsigned RevIdx = EE - II - 1;
8641 Value *DimsLVal = Builder.CreateInBoundsGEP(
8642 DimsAddr->getAllocatedType(), DimsAddr,
8643 {Builder.getInt64(0), Builder.getInt64(II)});
8644 // Offset
8645 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8646 Builder.CreateAlignedStore(
8647 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8648 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8649 // Count
8650 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8651 Builder.CreateAlignedStore(
8652 NonContigInfo.Counts[L][RevIdx], CountLVal,
8653 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8654 // Stride
8655 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8656 Builder.CreateAlignedStore(
8657 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8658 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8659 }
8660 // args[I] = &dims
8661 Builder.restoreIP(CodeGenIP);
8662 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8663 DimsAddr, Builder.getPtrTy());
8664 Value *P = Builder.CreateConstInBoundsGEP2_32(
8665 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8666 Info.RTArgs.PointersArray, 0, I);
8667 Builder.CreateAlignedStore(
8668 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8669 ++L;
8670 }
8671}
8672
8673void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8674 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8675 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8676 BasicBlock *ExitBB, bool IsInit) {
8677 StringRef Prefix = IsInit ? ".init" : ".del";
8678
8679 // Evaluate if this is an array section.
8680 BasicBlock *BodyBB = BasicBlock::Create(
8681 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8682 Value *IsArray =
8683 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8684 Value *DeleteBit = Builder.CreateAnd(
8685 MapType,
8686 Builder.getInt64(
8687 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8688 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8689 Value *DeleteCond;
8690 Value *Cond;
8691 if (IsInit) {
8692 // base != begin?
8693 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8694 // IsPtrAndObj?
8695 Value *PtrAndObjBit = Builder.CreateAnd(
8696 MapType,
8697 Builder.getInt64(
8698 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8699 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8700 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8701 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8702 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8703 DeleteCond = Builder.CreateIsNull(
8704 DeleteBit,
8705 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8706 } else {
8707 Cond = IsArray;
8708 DeleteCond = Builder.CreateIsNotNull(
8709 DeleteBit,
8710 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8711 }
8712 Cond = Builder.CreateAnd(Cond, DeleteCond);
8713 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8714
8715 emitBlock(BodyBB, MapperFn);
8716 // Get the array size by multiplying element size and element number (i.e., \p
8717 // Size).
8718 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8719 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8720 // memory allocation/deletion purpose only.
8721 Value *MapTypeArg = Builder.CreateAnd(
8722 MapType,
8723 Builder.getInt64(
8724 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8725 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8726 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8727 MapTypeArg = Builder.CreateOr(
8728 MapTypeArg,
8729 Builder.getInt64(
8730 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8731 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8732
8733 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8734 // data structure.
8735 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8736 ArraySize, MapTypeArg, MapName};
8737 Builder.CreateCall(
8738 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8739 OffloadingArgs);
8740}
8741
8742Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8743 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8744 llvm::Value *BeginArg)>
8745 GenMapInfoCB,
8746 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8747 SmallVector<Type *> Params;
8748 Params.emplace_back(Builder.getPtrTy());
8749 Params.emplace_back(Builder.getPtrTy());
8750 Params.emplace_back(Builder.getPtrTy());
8751 Params.emplace_back(Builder.getInt64Ty());
8752 Params.emplace_back(Builder.getInt64Ty());
8753 Params.emplace_back(Builder.getPtrTy());
8754
8755 auto *FnTy =
8756 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8757
8758 SmallString<64> TyStr;
8759 raw_svector_ostream Out(TyStr);
8760 Function *MapperFn =
8761 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, &M);
8762 MapperFn->addFnAttr(Attribute::NoInline);
8763 MapperFn->addFnAttr(Attribute::NoUnwind);
8764 MapperFn->addParamAttr(0, Attribute::NoUndef);
8765 MapperFn->addParamAttr(1, Attribute::NoUndef);
8766 MapperFn->addParamAttr(2, Attribute::NoUndef);
8767 MapperFn->addParamAttr(3, Attribute::NoUndef);
8768 MapperFn->addParamAttr(4, Attribute::NoUndef);
8769 MapperFn->addParamAttr(5, Attribute::NoUndef);
8770
8771 // Start the mapper function code generation.
8772 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8773 auto SavedIP = Builder.saveIP();
8774 Builder.SetInsertPoint(EntryBB);
8775
8776 Value *MapperHandle = MapperFn->getArg(0);
8777 Value *BaseIn = MapperFn->getArg(1);
8778 Value *BeginIn = MapperFn->getArg(2);
8779 Value *Size = MapperFn->getArg(3);
8780 Value *MapType = MapperFn->getArg(4);
8781 Value *MapName = MapperFn->getArg(5);
8782
8783 // Compute the starting and end addresses of array elements.
8784 // Prepare common arguments for array initiation and deletion.
8785 // Convert the size in bytes into the number of array elements.
8786 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8787 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8788 Value *PtrBegin = BeginIn;
8789 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8790
8791 // Emit array initiation if this is an array section and \p MapType indicates
8792 // that memory allocation is required.
8793 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8794 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8795 MapType, MapName, ElementSize, HeadBB,
8796 /*IsInit=*/true);
8797
8798 // Emit a for loop to iterate through SizeArg elements and map all of them.
8799
8800 // Emit the loop header block.
8801 emitBlock(HeadBB, MapperFn);
8802 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8803 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8804 // Evaluate whether the initial condition is satisfied.
8805 Value *IsEmpty =
8806 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8807 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8808
8809 // Emit the loop body block.
8810 emitBlock(BodyBB, MapperFn);
8811 BasicBlock *LastBB = BodyBB;
8812 PHINode *PtrPHI =
8813 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8814 PtrPHI->addIncoming(PtrBegin, HeadBB);
8815
8816 // Get map clause information. Fill up the arrays with all mapped variables.
8817 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8818 if (!Info)
8819 return Info.takeError();
8820
8821 // Call the runtime API __tgt_mapper_num_components to get the number of
8822 // pre-existing components.
8823 Value *OffloadingArgs[] = {MapperHandle};
8824 Value *PreviousSize = Builder.CreateCall(
8825 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8826 OffloadingArgs);
8827 Value *ShiftedPreviousSize =
8828 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8829
8830 // Fill up the runtime mapper handle for all components.
8831 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8832 Value *CurBaseArg = Info->BasePointers[I];
8833 Value *CurBeginArg = Info->Pointers[I];
8834 Value *CurSizeArg = Info->Sizes[I];
8835 Value *CurNameArg = Info->Names.size()
8836 ? Info->Names[I]
8837 : ConstantPointerNull::get(Builder.getPtrTy());
8838
8839 // Extract the MEMBER_OF field from the map type.
8840 Value *OriMapType = Builder.getInt64(
8841 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8842 Info->Types[I]));
8843 Value *MemberMapType =
8844 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8845
8846 // Combine the map type inherited from user-defined mapper with that
8847 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8848 // bits of the \a MapType, which is the input argument of the mapper
8849 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8850 // bits of MemberMapType.
8851 // [OpenMP 5.0], 1.2.6. map-type decay.
8852 // | alloc | to | from | tofrom | release | delete
8853 // ----------------------------------------------------------
8854 // alloc | alloc | alloc | alloc | alloc | release | delete
8855 // to | alloc | to | alloc | to | release | delete
8856 // from | alloc | alloc | from | from | release | delete
8857 // tofrom | alloc | to | from | tofrom | release | delete
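    // Worked example: if this mapper is invoked with `to` and the member
    // was declared `tofrom`, the decayed type is `to` (row `to`, column
    // `tofrom` above), so the code below keeps OMP_MAP_TO and clears
    // OMP_MAP_FROM.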
8858 Value *LeftToFrom = Builder.CreateAnd(
8859 MapType,
8860 Builder.getInt64(
8861 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8862 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8863 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8864 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8865 BasicBlock *AllocElseBB =
8866 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8867 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8868 BasicBlock *ToElseBB =
8869 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8870 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8871 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8872 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8873 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8874 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8875 emitBlock(AllocBB, MapperFn);
8876 Value *AllocMapType = Builder.CreateAnd(
8877 MemberMapType,
8878 Builder.getInt64(
8879 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8880 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8881 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8882 Builder.CreateBr(EndBB);
8883 emitBlock(AllocElseBB, MapperFn);
8884 Value *IsTo = Builder.CreateICmpEQ(
8885 LeftToFrom,
8886 Builder.getInt64(
8887 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8888 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8889 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8890 // In case of to, clear OMP_MAP_FROM.
8891 emitBlock(ToBB, MapperFn);
8892 Value *ToMapType = Builder.CreateAnd(
8893 MemberMapType,
8894 Builder.getInt64(
8895 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8896 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8897 Builder.CreateBr(EndBB);
8898 emitBlock(ToElseBB, MapperFn);
8899 Value *IsFrom = Builder.CreateICmpEQ(
8900 LeftToFrom,
8901 Builder.getInt64(
8902 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8903 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8904 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8905 // In case of from, clear OMP_MAP_TO.
8906 emitBlock(FromBB, MapperFn);
8907 Value *FromMapType = Builder.CreateAnd(
8908 MemberMapType,
8909 Builder.getInt64(
8910 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8911 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8912 // In case of tofrom, do nothing.
8913 emitBlock(EndBB, MapperFn);
8914 LastBB = EndBB;
8915 PHINode *CurMapType =
8916 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8917 CurMapType->addIncoming(AllocMapType, AllocBB);
8918 CurMapType->addIncoming(ToMapType, ToBB);
8919 CurMapType->addIncoming(FromMapType, FromBB);
8920 CurMapType->addIncoming(MemberMapType, ToElseBB);
8921
8922 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8923 CurSizeArg, CurMapType, CurNameArg};
8924
8925 auto ChildMapperFn = CustomMapperCB(I);
8926 if (!ChildMapperFn)
8927 return ChildMapperFn.takeError();
8928 if (*ChildMapperFn) {
8929 // Call the corresponding mapper function.
8930 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8931 } else {
8932 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8933 // data structure.
8934 Builder.CreateCall(
8935 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8936 OffloadingArgs);
8937 }
8938 }
8939
8940 // Update the pointer to point to the next element that needs to be mapped,
8941 // and check whether we have mapped all elements.
8942 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8943 "omp.arraymap.next");
8944 PtrPHI->addIncoming(PtrNext, LastBB);
8945 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8946 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8947 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8948
8949 emitBlock(ExitBB, MapperFn);
8950 // Emit array deletion if this is an array section and \p MapType indicates
8951 // that deletion is required.
8952 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8953 MapType, MapName, ElementSize, DoneBB,
8954 /*IsInit=*/false);
8955
8956 // Emit the function exit block.
8957 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8958
8958
8959 Builder.CreateRetVoid();
8960 Builder.restoreIP(SavedIP);
8961 return MapperFn;
8962}
8963
8965 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8966 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8967 bool IsNonContiguous,
8968 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8969
8970 // Reset the array information.
8971 Info.clearArrayInfo();
8972 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8973
8974 if (Info.NumberOfPtrs == 0)
8975 return Error::success();
8976
8977 Builder.restoreIP(AllocaIP);
8978 // Detect if we have any capture size requiring runtime evaluation of the
8979 // size so that a constant array could be eventually used.
8980 ArrayType *PointerArrayType =
8981 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8982
8983 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8984 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8985
8986 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8987 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8988 AllocaInst *MappersArray = Builder.CreateAlloca(
8989 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8990 Info.RTArgs.MappersArray = MappersArray;
8991
8992 // If we don't have any VLA types or other types that require runtime
8993 // evaluation, we can use a constant array for the map sizes, otherwise we
8994 // need to fill up the arrays as we do for the pointers.
8995 Type *Int64Ty = Builder.getInt64Ty();
8996 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8997 ConstantInt::get(Int64Ty, 0));
8998 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8999 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9000 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9001 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9002 if (IsNonContiguous &&
9003 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9004 CombinedInfo.Types[I] &
9005 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9006 ConstSizes[I] =
9007 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9008 else
9009 ConstSizes[I] = CI;
9010 continue;
9011 }
9012 }
9013 RuntimeSizes.set(I);
9014 }
9015
9016 if (RuntimeSizes.all()) {
9017 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9018 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9019 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9020 restoreIPandDebugLoc(Builder, CodeGenIP);
9021 } else {
9022 auto *SizesArrayInit = ConstantArray::get(
9023 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9024 std::string Name = createPlatformSpecificName({"offload_sizes"});
9025 auto *SizesArrayGbl =
9026 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9027 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9028 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9029
9030 if (!RuntimeSizes.any()) {
9031 Info.RTArgs.SizesArray = SizesArrayGbl;
9032 } else {
9033 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9034 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9035 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9036 AllocaInst *Buffer = Builder.CreateAlloca(
9037 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9038 Buffer->setAlignment(OffloadSizeAlign);
9039 restoreIPandDebugLoc(Builder, CodeGenIP);
9040 Builder.CreateMemCpy(
9041 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9042 SizesArrayGbl, OffloadSizeAlign,
9043 Builder.getIntN(
9044 IndexSize,
9045 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9046
9047 Info.RTArgs.SizesArray = Buffer;
9048 }
9049 restoreIPandDebugLoc(Builder, CodeGenIP);
9050 }
9051
9052 // The map types are always constant so we don't need to generate code to
9053 // fill arrays. Instead, we create an array constant.
9054 SmallVector<uint64_t, 4> Mapping;
9055 for (auto mapFlag : CombinedInfo.Types)
9056 Mapping.push_back(
9057 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9058 mapFlag));
9059 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9060 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9061 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9062
9063 // The information types are only built if provided.
9064 if (!CombinedInfo.Names.empty()) {
9065 auto *MapNamesArrayGbl = createOffloadMapnames(
9066 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9067 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9068 Info.EmitDebug = true;
9069 } else {
9070 Info.RTArgs.MapNamesArray =
9071 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9072 Info.EmitDebug = false;
9073 }
9074
9075 // If there's a present map type modifier, it must not be applied to the end
9076 // of a region, so generate a separate map type array in that case.
9077 if (Info.separateBeginEndCalls()) {
9078 bool EndMapTypesDiffer = false;
9079 for (uint64_t &Type : Mapping) {
9080 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9081 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9082 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9083 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9084 EndMapTypesDiffer = true;
9085 }
9086 }
9087 if (EndMapTypesDiffer) {
9088 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9089 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9090 }
9091 }
9092
9093 PointerType *PtrTy = Builder.getPtrTy();
9094 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9095 Value *BPVal = CombinedInfo.BasePointers[I];
9096 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9097 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9098 0, I);
9099 Builder.CreateAlignedStore(BPVal, BP,
9100 M.getDataLayout().getPrefTypeAlign(PtrTy));
9101
9102 if (Info.requiresDevicePointerInfo()) {
9103 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9104 CodeGenIP = Builder.saveIP();
9105 Builder.restoreIP(AllocaIP);
9106 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9107 Builder.restoreIP(CodeGenIP);
9108 if (DeviceAddrCB)
9109 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9110 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9111 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9112 if (DeviceAddrCB)
9113 DeviceAddrCB(I, BP);
9114 }
9115 }
9116
9117 Value *PVal = CombinedInfo.Pointers[I];
9118 Value *P = Builder.CreateConstInBoundsGEP2_32(
9119 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9120 I);
9121 // TODO: Check that the alignment is correct.
9122 Builder.CreateAlignedStore(PVal, P,
9123 M.getDataLayout().getPrefTypeAlign(PtrTy));
9124
9125 if (RuntimeSizes.test(I)) {
9126 Value *S = Builder.CreateConstInBoundsGEP2_32(
9127 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9128 /*Idx0=*/0,
9129 /*Idx1=*/I);
9130 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9131 Int64Ty,
9132 /*isSigned=*/true),
9133 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9134 }
9135 // Fill up the mapper array.
9136 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9137 Value *MFunc = ConstantPointerNull::get(PtrTy);
9138
9139 auto CustomMFunc = CustomMapperCB(I);
9140 if (!CustomMFunc)
9141 return CustomMFunc.takeError();
9142 if (*CustomMFunc)
9143 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9144
9145 Value *MAddr = Builder.CreateInBoundsGEP(
9146 MappersArray->getAllocatedType(), MappersArray,
9147 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9148 Builder.CreateAlignedStore(
9149 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9150 }
9151
9152 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9153 Info.NumberOfPtrs == 0)
9154 return Error::success();
9155 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9156 return Error::success();
9157}
9158
9159void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9160 BasicBlock *CurBB = Builder.GetInsertBlock();
9161
9162 if (!CurBB || CurBB->getTerminator()) {
9163 // If there is no insert point or the previous block is already
9164 // terminated, don't touch it.
9165 } else {
9166 // Otherwise, create a fall-through branch.
9167 Builder.CreateBr(Target);
9168 }
9169
9170 Builder.ClearInsertionPoint();
9171}
9172
9173void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9174 bool IsFinished) {
9175 BasicBlock *CurBB = Builder.GetInsertBlock();
9176
9177 // Fall out of the current block (if necessary).
9178 emitBranch(BB);
9179
9180 if (IsFinished && BB->use_empty()) {
9181 BB->eraseFromParent();
9182 return;
9183 }
9184
9185 // Place the block after the current block, if possible, or else at
9186 // the end of the function.
9187 if (CurBB && CurBB->getParent())
9188 CurFn->insert(std::next(CurBB->getIterator()), BB);
9189 else
9190 CurFn->insert(CurFn->end(), BB);
9191 Builder.SetInsertPoint(BB);
9192}
9193
9194Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9195 BodyGenCallbackTy ElseGen,
9196 InsertPointTy AllocaIP) {
9197 // If the condition constant folds and can be elided, try to avoid emitting
9198 // the condition and the dead arm of the if/else.
9199 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9200 auto CondConstant = CI->getSExtValue();
9201 if (CondConstant)
9202 return ThenGen(AllocaIP, Builder.saveIP());
9203
9204 return ElseGen(AllocaIP, Builder.saveIP());
9205 }
9206
9207 Function *CurFn = Builder.GetInsertBlock()->getParent();
9208
9209 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9210 // emit the conditional branch.
9211 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9212 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9213 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9214 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9215 // Emit the 'then' code.
9216 emitBlock(ThenBlock, CurFn);
9217 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9218 return Err;
9219 emitBranch(ContBlock);
9220 // Emit the 'else' code if present.
9221 // There is no need to emit line number for unconditional branch.
9222 emitBlock(ElseBlock, CurFn);
9223 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9224 return Err;
9225 // There is no need to emit line number for unconditional branch.
9226 emitBranch(ContBlock);
9227 // Emit the continuation block for code after the if.
9228 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9229 return Error::success();
9230}
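// For reference, the non-folded path above produces the usual diamond:
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// with both arms branching to %omp_if.end, where code after the `if` resumes.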
9231
9232bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9233 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9236 "Unexpected Atomic Ordering.");
9237
9238 bool Flush = false;
9239 AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
9240
9241 switch (AK) {
9242 case Read:
9243 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9244 AO == AtomicOrdering::SequentiallyConsistent) {
9245 FlushAO = AtomicOrdering::Acquire;
9246 Flush = true;
9247 }
9248 break;
9249 case Write:
9250 case Compare:
9251 case Update:
9252 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9253 AO == AtomicOrdering::SequentiallyConsistent) {
9254 FlushAO = AtomicOrdering::Release;
9255 Flush = true;
9256 }
9257 break;
9258 case Capture:
9259 switch (AO) {
9260 case AtomicOrdering::Acquire:
9261 FlushAO = AtomicOrdering::Acquire;
9262 Flush = true;
9263 break;
9264 case AtomicOrdering::Release:
9265 FlushAO = AtomicOrdering::Release;
9266 Flush = true;
9267 break;
9268 case AtomicOrdering::AcquireRelease:
9269 case AtomicOrdering::SequentiallyConsistent:
9270 FlushAO = AtomicOrdering::AcquireRelease;
9271 Flush = true;
9272 break;
9273 default:
9274 // do nothing - leave silently.
9275 break;
9276 }
9277 }
9278
9279 if (Flush) {
9280 // The flush runtime call does not yet take a memory ordering; until it
9281 // does, this resolves which atomic ordering the flush would need but
9282 // still issues the plain flush call.
9283 // TODO: pass `FlushAO` after memory ordering support is added
9284 (void)FlushAO;
9285 emitFlush(Loc);
9286 }
9287
9288 // For AO == AtomicOrdering::Monotonic and all other combinations,
9289 // do nothing.
9290 return Flush;
9291}
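// Summary of the mapping implemented above (atomic kind x ordering -> flush):
//   Read:                 acquire, acq_rel, seq_cst -> acquire flush
//   Write/Update/Compare: release, acq_rel, seq_cst -> release flush
//   Capture:              acquire -> acquire, release -> release,
//                         acq_rel, seq_cst -> acq_rel
// Monotonic (relaxed) never flushes.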
9292
9293OpenMPIRBuilder::InsertPointTy
9294OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9295 AtomicOpValue &X, AtomicOpValue &V,
9296 AtomicOrdering AO, InsertPointTy AllocaIP) {
9297 if (!updateToLocation(Loc))
9298 return Loc.IP;
9299
9300 assert(X.Var->getType()->isPointerTy() &&
9301 "OMP Atomic expects a pointer to target memory");
9302 Type *XElemTy = X.ElemTy;
9303 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9304 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9305 "OMP atomic read expected a scalar type");
9306
9307 Value *XRead = nullptr;
9308
9309 if (XElemTy->isIntegerTy()) {
9310 LoadInst *XLD =
9311 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9312 XLD->setAtomic(AO);
9313 XRead = cast<Value>(XLD);
9314 } else if (XElemTy->isStructTy()) {
9315 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9316 // target does not support `atomicrmw` of the size of the struct
9317 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9318 OldVal->setAtomic(AO);
9319 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9320 unsigned LoadSize =
9321 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9322 OpenMPIRBuilder::AtomicInfo atomicInfo(
9323 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9324 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9325 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9326 XRead = AtomicLoadRes.first;
9327 OldVal->eraseFromParent();
9328 } else {
9329 // We need to perform the atomic operation as an integer.
9330 IntegerType *IntCastTy =
9331 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9332 LoadInst *XLoad =
9333 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9334 XLoad->setAtomic(AO);
9335 if (XElemTy->isFloatingPointTy()) {
9336 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9337 } else {
9338 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9339 }
9340 }
9341 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9342 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9343 return Builder.saveIP();
9344}
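// Illustration (sketch for a 32-bit integer with acquire ordering): the
// integer path above lowers `v = x;` under `#pragma omp atomic read` to
//   %omp.atomic.read = load atomic i32, ptr %x acquire, align 4
//   store i32 %omp.atomic.read, ptr %v
// followed by the acquire flush decided by checkAndEmitFlushAfterAtomic.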
9345
9346OpenMPIRBuilder::InsertPointTy
9347OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9348 AtomicOpValue &X, Value *Expr,
9349 AtomicOrdering AO, InsertPointTy AllocaIP) {
9350 if (!updateToLocation(Loc))
9351 return Loc.IP;
9352
9353 assert(X.Var->getType()->isPointerTy() &&
9354 "OMP Atomic expects a pointer to target memory");
9355 Type *XElemTy = X.ElemTy;
9356 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9357 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9358 "OMP atomic write expected a scalar type");
9359
9360 if (XElemTy->isIntegerTy()) {
9361 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9362 XSt->setAtomic(AO);
9363 } else if (XElemTy->isStructTy()) {
9364 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9365 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9366 unsigned LoadSize =
9367 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9368 OpenMPIRBuilder::AtomicInfo atomicInfo(
9369 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9370 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9371 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9372 OldVal->eraseFromParent();
9373 } else {
9374 // We need to bitcast and perform the atomic operation as an integer.
9375 IntegerType *IntCastTy =
9376 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9377 Value *ExprCast =
9378 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9379 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9380 XSt->setAtomic(AO);
9381 }
9382
9383 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9384 return Builder.saveIP();
9385}
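// Illustration (sketch for a float with release ordering): the bitcast path
// above lowers `x = expr;` under `#pragma omp atomic write` to
//   %cast = bitcast float %expr to i32
//   store atomic i32 %cast, ptr %x release, align 4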
9386
9387OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9388 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9389 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9390 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9391 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9392 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9393 if (!updateToLocation(Loc))
9394 return Loc.IP;
9395
9396 LLVM_DEBUG({
9397 Type *XTy = X.Var->getType();
9398 assert(XTy->isPointerTy() &&
9399 "OMP Atomic expects a pointer to target memory");
9400 Type *XElemTy = X.ElemTy;
9401 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9402 XElemTy->isPointerTy()) &&
9403 "OMP atomic update expected a scalar type");
9404 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9405 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9406 "OpenMP atomic does not support LT or GT operations");
9407 });
9408
9409 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9410 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9411 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9412 if (!AtomicResult)
9413 return AtomicResult.takeError();
9414 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9415 return Builder.saveIP();
9416}
9417
9418// FIXME: Duplicating AtomicExpand
9419Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9420 AtomicRMWInst::BinOp RMWOp) {
9421 switch (RMWOp) {
9422 case AtomicRMWInst::Add:
9423 return Builder.CreateAdd(Src1, Src2);
9424 case AtomicRMWInst::Sub:
9425 return Builder.CreateSub(Src1, Src2);
9426 case AtomicRMWInst::And:
9427 return Builder.CreateAnd(Src1, Src2);
9428 case AtomicRMWInst::Nand:
9429 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9430 case AtomicRMWInst::Or:
9431 return Builder.CreateOr(Src1, Src2);
9432 case AtomicRMWInst::Xor:
9433 return Builder.CreateXor(Src1, Src2);
9434 case AtomicRMWInst::Xchg:
9435 case AtomicRMWInst::FAdd:
9436 case AtomicRMWInst::FSub:
9437 case AtomicRMWInst::BAD_BINOP:
9438 case AtomicRMWInst::Max:
9439 case AtomicRMWInst::Min:
9440 case AtomicRMWInst::UMax:
9441 case AtomicRMWInst::UMin:
9442 case AtomicRMWInst::FMax:
9443 case AtomicRMWInst::FMin:
9444 case AtomicRMWInst::FMaximum:
9445 case AtomicRMWInst::FMinimum:
9446 case AtomicRMWInst::UIncWrap:
9447 case AtomicRMWInst::UDecWrap:
9448 case AtomicRMWInst::USubCond:
9449 case AtomicRMWInst::USubSat:
9450 llvm_unreachable("Unsupported atomic update operation");
9451 }
9452 llvm_unreachable("Unsupported atomic update operation");
9453}
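// The recomputation matters for capture clauses: e.g. for `x -= expr` the
// atomicrmw returns the old value, and this helper rebuilds `old - expr` so
// the updated value is available without rereading memory.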
9454
9455Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9456 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9457 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9458 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9459 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9460 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9461 // or a complex datatype.
9462 bool emitRMWOp = false;
9463 switch (RMWOp) {
9464 case AtomicRMWInst::Add:
9465 case AtomicRMWInst::And:
9466 case AtomicRMWInst::Nand:
9467 case AtomicRMWInst::Or:
9468 case AtomicRMWInst::Xor:
9469 case AtomicRMWInst::Xchg:
9470 emitRMWOp = XElemTy;
9471 break;
9472 case AtomicRMWInst::Sub:
9473 emitRMWOp = (IsXBinopExpr && XElemTy);
9474 break;
9475 default:
9476 emitRMWOp = false;
9477 }
9478 emitRMWOp &= XElemTy->isIntegerTy();
9479
9480 std::pair<Value *, Value *> Res;
9481 if (emitRMWOp) {
9482 AtomicRMWInst *RMWInst =
9483 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9484 if (T.isAMDGPU()) {
9485 if (IsIgnoreDenormalMode)
9486 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9487 llvm::MDNode::get(Builder.getContext(), {}));
9488 if (!IsFineGrainedMemory)
9489 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9490 llvm::MDNode::get(Builder.getContext(), {}));
9491 if (!IsRemoteMemory)
9492 RMWInst->setMetadata("amdgpu.no.remote.memory",
9493 llvm::MDNode::get(Builder.getContext(), {}));
9494 }
9495 Res.first = RMWInst;
9496 // Not needed except for postfix captures; generated anyway for
9497 // consistency with the else branch and removed by any DCE pass.
9498 // AtomicRMWInst::Xchg does not have a corresponding binary instruction.
9499 if (RMWOp == AtomicRMWInst::Xchg)
9500 Res.second = Res.first;
9501 else
9502 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9503 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9504 XElemTy->isStructTy()) {
9505 LoadInst *OldVal =
9506 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9507 OldVal->setAtomic(AO);
9508 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9509 unsigned LoadSize =
9510 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9511
9512 OpenMPIRBuilder::AtomicInfo atomicInfo(
9513 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9514 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9515 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9516 BasicBlock *CurBB = Builder.GetInsertBlock();
9517 Instruction *CurBBTI = CurBB->getTerminator();
9518 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9519 BasicBlock *ExitBB =
9520 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9521 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9522 X->getName() + ".atomic.cont");
9523 ContBB->getTerminator()->eraseFromParent();
9524 Builder.restoreIP(AllocaIP);
9525 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9526 NewAtomicAddr->setName(X->getName() + "x.new.val");
9527 Builder.SetInsertPoint(ContBB);
9528 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9529 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9530 Value *OldExprVal = PHI;
9531 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9532 if (!CBResult)
9533 return CBResult.takeError();
9534 Value *Upd = *CBResult;
9535 Builder.CreateStore(Upd, NewAtomicAddr);
9536 AtomicOrdering Failure =
9537 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9538 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9539 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9540 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9541 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9542 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9543 OldVal->eraseFromParent();
9544 Res.first = OldExprVal;
9545 Res.second = Upd;
9546
9547 if (UnreachableInst *ExitTI =
9548 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9549 CurBBTI->eraseFromParent();
9550 Builder.SetInsertPoint(ExitBB);
9551 } else {
9552 Builder.SetInsertPoint(ExitTI);
9553 }
9554 } else {
9555 IntegerType *IntCastTy =
9556 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9557 LoadInst *OldVal =
9558 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9559 OldVal->setAtomic(AO);
9560 // CurBB
9561 // | /---\
9562 // ContBB |
9563 // | \---/
9564 // ExitBB
9565 BasicBlock *CurBB = Builder.GetInsertBlock();
9566 Instruction *CurBBTI = CurBB->getTerminator();
9567 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9568 BasicBlock *ExitBB =
9569 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9570 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9571 X->getName() + ".atomic.cont");
9572 ContBB->getTerminator()->eraseFromParent();
9573 Builder.restoreIP(AllocaIP);
9574 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9575 NewAtomicAddr->setName(X->getName() + "x.new.val");
9576 Builder.SetInsertPoint(ContBB);
9577 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9578 PHI->addIncoming(OldVal, CurBB);
9579 bool IsIntTy = XElemTy->isIntegerTy();
9580 Value *OldExprVal = PHI;
9581 if (!IsIntTy) {
9582 if (XElemTy->isFloatingPointTy()) {
9583 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9584 X->getName() + ".atomic.fltCast");
9585 } else {
9586 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9587 X->getName() + ".atomic.ptrCast");
9588 }
9589 }
9590
9591 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9592 if (!CBResult)
9593 return CBResult.takeError();
9594 Value *Upd = *CBResult;
9595 Builder.CreateStore(Upd, NewAtomicAddr);
9596 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9597 AtomicOrdering Failure =
9598 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9599 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9600 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9601 Result->setVolatile(VolatileX);
9602 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9603 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9604 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9605 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9606
9607 Res.first = OldExprVal;
9608 Res.second = Upd;
9609
9610 // set Insertion point in exit block
9611 if (UnreachableInst *ExitTI =
9612 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9613 CurBBTI->eraseFromParent();
9614 Builder.SetInsertPoint(ExitBB);
9615 } else {
9616 Builder.SetInsertPoint(ExitTI);
9617 }
9618 }
9619
9620 return Res;
9621}
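// Shape of the compare-exchange fallback above (rough sketch for a variable
// named "x" of a non-integer type):
//   entry:         %old = load atomic i32, ptr %x
//   x.atomic.cont: %phi = phi i32 [ %old, %entry ], [ %prev, %x.atomic.cont ]
//                  <UpdateOp applied to the cast value, stored to xx.new.val>
//                  %pair = cmpxchg ptr %x, i32 %phi, i32 %desired
//                  %prev = extractvalue { i32, i1 } %pair, 0
//                  %ok   = extractvalue { i32, i1 } %pair, 1
//                  br i1 %ok, label %x.atomic.exit, label %x.atomic.cont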
9622
9623OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9624 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9625 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9626 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9627 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9628 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9629 if (!updateToLocation(Loc))
9630 return Loc.IP;
9631
9632 LLVM_DEBUG({
9633 Type *XTy = X.Var->getType();
9634 assert(XTy->isPointerTy() &&
9635 "OMP Atomic expects a pointer to target memory");
9636 Type *XElemTy = X.ElemTy;
9637 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9638 XElemTy->isPointerTy()) &&
9639 "OMP atomic capture expected a scalar type");
9640 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9641 "OpenMP atomic does not support LT or GT operations");
9642 });
9643
9644 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9645 // 'x' is simply atomically rewritten with 'expr'.
9646 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9647 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9648 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9649 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9650 if (!AtomicResult)
9651 return AtomicResult.takeError();
9652 Value *CapturedVal =
9653 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9654 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9655
9656 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9657 return Builder.saveIP();
9658}
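// Capture semantics: the postfix form `{v = x; x binop= expr;}` stores the
// old value (AtomicResult->first) to 'v', while the prefix form
// `{x binop= expr; v = x;}` stores the updated value (AtomicResult->second).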
9659
9660OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9661 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9662 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9663 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9664 bool IsFailOnly) {
9665
9667 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9668 IsPostfixUpdate, IsFailOnly, Failure);
9669}
9670
9671OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9672 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9673 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9674 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9675 bool IsFailOnly, AtomicOrdering Failure) {
9676
9677 if (!updateToLocation(Loc))
9678 return Loc.IP;
9679
9680 assert(X.Var->getType()->isPointerTy() &&
9681 "OMP atomic expects a pointer to target memory");
9682 // compare capture
9683 if (V.Var) {
9684 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9685 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9686 }
9687
9688 bool IsInteger = E->getType()->isIntegerTy();
9689
9690 if (Op == OMPAtomicCompareOp::EQ) {
9691 AtomicCmpXchgInst *Result = nullptr;
9692 if (!IsInteger) {
9693 IntegerType *IntCastTy =
9694 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9695 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9696 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9697 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9698 AO, Failure);
9699 } else {
9700 Result =
9701 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9702 }
9703
9704 if (V.Var) {
9705 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9706 if (!IsInteger)
9707 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9708 assert(OldValue->getType() == V.ElemTy &&
9709 "OldValue and V must be of same type");
9710 if (IsPostfixUpdate) {
9711 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9712 } else {
9713 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9714 if (IsFailOnly) {
9715 // CurBB----
9716 // | |
9717 // v |
9718 // ContBB |
9719 // | |
9720 // v |
9721 // ExitBB <-
9722 //
9723 // where ContBB only contains the store of old value to 'v'.
9724 BasicBlock *CurBB = Builder.GetInsertBlock();
9725 Instruction *CurBBTI = CurBB->getTerminator();
9726 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9727 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9728 CurBBTI, X.Var->getName() + ".atomic.exit");
9729 BasicBlock *ContBB = CurBB->splitBasicBlock(
9730 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9731 ContBB->getTerminator()->eraseFromParent();
9732 CurBB->getTerminator()->eraseFromParent();
9733
9734 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9735
9736 Builder.SetInsertPoint(ContBB);
9737 Builder.CreateStore(OldValue, V.Var);
9738 Builder.CreateBr(ExitBB);
9739
9740 if (UnreachableInst *ExitTI =
9741 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9742 CurBBTI->eraseFromParent();
9743 Builder.SetInsertPoint(ExitBB);
9744 } else {
9745 Builder.SetInsertPoint(ExitTI);
9746 }
9747 } else {
9748 Value *CapturedValue =
9749 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9750 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9751 }
9752 }
9753 }
9754 // The comparison result has to be stored.
9755 if (R.Var) {
9756 assert(R.Var->getType()->isPointerTy() &&
9757 "r.var must be of pointer type");
9758 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9759
9760 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9761 Value *ResultCast = R.IsSigned
9762 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9763 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9764 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9765 }
9766 } else {
9767 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9768 "Op should be either max or min at this point");
9769 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9770
9771 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9772 // Let's take max as example.
9773 // OpenMP form:
9774 // x = x > expr ? expr : x;
9775 // LLVM form:
9776 // *ptr = *ptr > val ? *ptr : val;
9777 // We need to transform to LLVM form.
9778 // x = x <= expr ? x : expr;
9779 AtomicRMWInst::BinOp NewOp;
9780 if (IsXBinopExpr) {
9781 if (IsInteger) {
9782 if (X.IsSigned)
9783 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9784 : AtomicRMWInst::Max;
9785 else
9786 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9787 : AtomicRMWInst::UMax;
9788 } else {
9789 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9790 : AtomicRMWInst::FMax;
9791 }
9792 } else {
9793 if (IsInteger) {
9794 if (X.IsSigned)
9795 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9796 : AtomicRMWInst::Min;
9797 else
9798 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9799 : AtomicRMWInst::UMin;
9800 } else {
9801 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9802 : AtomicRMWInst::FMin;
9803 }
9804 }
9805
9806 AtomicRMWInst *OldValue =
9807 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9808 if (V.Var) {
9809 Value *CapturedValue = nullptr;
9810 if (IsPostfixUpdate) {
9811 CapturedValue = OldValue;
9812 } else {
9813 CmpInst::Predicate Pred;
9814 switch (NewOp) {
9815 case AtomicRMWInst::Max:
9816 Pred = CmpInst::ICMP_SGT;
9817 break;
9818 case AtomicRMWInst::UMax:
9819 Pred = CmpInst::ICMP_UGT;
9820 break;
9821 case AtomicRMWInst::FMax:
9822 Pred = CmpInst::FCMP_OGT;
9823 break;
9824 case AtomicRMWInst::Min:
9825 Pred = CmpInst::ICMP_SLT;
9826 break;
9827 case AtomicRMWInst::UMin:
9828 Pred = CmpInst::ICMP_ULT;
9829 break;
9830 case AtomicRMWInst::FMin:
9831 Pred = CmpInst::FCMP_OLT;
9832 break;
9833 default:
9834 llvm_unreachable("unexpected comparison op");
9835 }
9836 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9837 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9838 }
9839 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9840 }
9841 }
9842
9843 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9844
9845 return Builder.saveIP();
9846}
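// Worked example of the reversal above: with Op == MAX, IsXBinopExpr set and
// a signed integer, the OpenMP form `x = x > e ? e : x` actually computes a
// minimum, so the builder emits `atomicrmw min ptr %x, i32 %e` and, for a
// capture clause, derives the captured value from the returned old value
// with the non-atomic compare-and-select shown above.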
9847
9848OpenMPIRBuilder::InsertPointOrErrorTy
9849OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9850 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9851 Value *NumTeamsUpper, Value *ThreadLimit,
9852 Value *IfExpr) {
9853 if (!updateToLocation(Loc))
9854 return InsertPointTy();
9855
9856 uint32_t SrcLocStrSize;
9857 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9858 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9859 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9860
9861 // Outer allocation basicblock is the entry block of the current function.
9862 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9863 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9864 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9865 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9866 }
9867
9868 // The current basic block is split into four basic blocks. After outlining,
9869 // they will be mapped as follows:
9870 // ```
9871 // def current_fn() {
9872 // current_basic_block:
9873 // br label %teams.exit
9874 // teams.exit:
9875 // ; instructions after teams
9876 // }
9877 //
9878 // def outlined_fn() {
9879 // teams.alloca:
9880 // br label %teams.body
9881 // teams.body:
9882 // ; instructions within teams body
9883 // }
9884 // ```
9885 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9886 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9887 BasicBlock *AllocaBB =
9888 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9889
9890 bool SubClausesPresent =
9891 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9892 // Push num_teams
9893 if (!Config.isTargetDevice() && SubClausesPresent) {
9894 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9895 "if lowerbound is non-null, then upperbound must also be non-null "
9896 "for bounds on num_teams");
9897
9898 if (NumTeamsUpper == nullptr)
9899 NumTeamsUpper = Builder.getInt32(0);
9900
9901 if (NumTeamsLower == nullptr)
9902 NumTeamsLower = NumTeamsUpper;
9903
9904 if (IfExpr) {
9905 assert(IfExpr->getType()->isIntegerTy() &&
9906 "argument to if clause must be an integer value");
9907
9908 // upper = ifexpr ? upper : 1
9909 if (IfExpr->getType() != Int1)
9910 IfExpr = Builder.CreateICmpNE(IfExpr,
9911 ConstantInt::get(IfExpr->getType(), 0));
9912 NumTeamsUpper = Builder.CreateSelect(
9913 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9914
9915 // lower = ifexpr ? lower : 1
9916 NumTeamsLower = Builder.CreateSelect(
9917 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9918 }
9919
9920 if (ThreadLimit == nullptr)
9921 ThreadLimit = Builder.getInt32(0);
9922
9923 Value *ThreadNum = getOrCreateThreadID(Ident);
9924 Builder.CreateCall(
9925 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9926 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9927 }
9928 // Generate the body of teams.
9929 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9930 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9931 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9932 return Err;
9933
9934 OutlineInfo OI;
9935 OI.EntryBB = AllocaBB;
9936 OI.ExitBB = ExitBB;
9937 OI.OuterAllocaBB = &OuterAllocaBB;
9938
9939 // Insert fake values for global tid and bound tid.
9940 SmallVector<Instruction *, 8> ToBeDeleted;
9941 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9942 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9943 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9944 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9945 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9946
9947 auto HostPostOutlineCB = [this, Ident,
9948 ToBeDeleted](Function &OutlinedFn) mutable {
9949 // The stale call instruction will be replaced with a new call instruction
9950 // for runtime call with the outlined function.
9951
9952 assert(OutlinedFn.hasOneUse() &&
9953 "there must be a single user for the outlined function");
9954 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9955 ToBeDeleted.push_back(StaleCI);
9956
9957 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9958 "Outlined function must have two or three arguments only");
9959
9960 bool HasShared = OutlinedFn.arg_size() == 3;
9961
9962 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9963 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9964 if (HasShared)
9965 OutlinedFn.getArg(2)->setName("data");
9966
9967 // Call to the runtime function for teams in the current function.
9968 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9969 "outlined function.");
9970 Builder.SetInsertPoint(StaleCI);
9971 SmallVector<Value *> Args = {
9972 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9973 if (HasShared)
9974 Args.push_back(StaleCI->getArgOperand(2));
9975 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9976 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9977 Args);
9978
9979 for (Instruction *I : llvm::reverse(ToBeDeleted))
9980 I->eraseFromParent();
9981 };
9982
9983 if (!Config.isTargetDevice())
9984 OI.PostOutlineCB = HostPostOutlineCB;
9985
9986 addOutlineInfo(std::move(OI));
9987
9988 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9989
9990 return Builder.saveIP();
9991}
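// Net effect on the host (sketch): after outlining, the teams construct
// becomes
//   call void @__kmpc_push_num_teams_51(ptr @ident, i32 %tid, i32 %lb,
//                                       i32 %ub, i32 %threadlimit)
//   call void @__kmpc_fork_teams(ptr @ident, i32 <nargs>, ptr @outlined_fn, ...)
// where the push call is only emitted when one of the clauses was present.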
9992
9993OpenMPIRBuilder::InsertPointOrErrorTy
9994OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
9995 InsertPointTy OuterAllocaIP,
9996 BodyGenCallbackTy BodyGenCB) {
9997 if (!updateToLocation(Loc))
9998 return InsertPointTy();
9999
10000 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10001
10002 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10003 BasicBlock *BodyBB =
10004 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10005 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10006 }
10007 BasicBlock *ExitBB =
10008 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10009 BasicBlock *BodyBB =
10010 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10011 BasicBlock *AllocaBB =
10012 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10013
10014 // Generate the body of distribute clause
10015 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10016 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10017 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10018 return Err;
10019
10020 OutlineInfo OI;
10021 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10022 OI.EntryBB = AllocaBB;
10023 OI.ExitBB = ExitBB;
10024
10025 addOutlineInfo(std::move(OI));
10026 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10027
10028 return Builder.saveIP();
10029}
10030
10031GlobalVariable *
10032OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10033 std::string VarName) {
10034 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10035 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10036 Names.size()),
10037 Names);
10038 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10039 M, MapNamesArrayInit->getType(),
10040 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10041 VarName);
10042 return MapNamesArrayGlobal;
10043}
10044
10045// Create all simple and struct types exposed by the runtime and remember
10046// the llvm::PointerTypes of them for easy access later.
10047void OpenMPIRBuilder::initializeTypes(Module &M) {
10048 LLVMContext &Ctx = M.getContext();
10049 StructType *T;
10050#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10051#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10052 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10053 VarName##PtrTy = PointerType::getUnqual(Ctx);
10054#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10055 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10056 VarName##Ptr = PointerType::getUnqual(Ctx);
10057#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10058 T = StructType::getTypeByName(Ctx, StructName); \
10059 if (!T) \
10060 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10061 VarName = T; \
10062 VarName##Ptr = PointerType::getUnqual(Ctx);
10063#include "llvm/Frontend/OpenMP/OMPKinds.def"
10064}
10065
10066void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10067 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10068 SmallVectorImpl<BasicBlock *> &BlockVector) {
10069 SmallVector<BasicBlock *, 32> Worklist;
10070 BlockSet.insert(EntryBB);
10071 BlockSet.insert(ExitBB);
10072
10073 Worklist.push_back(EntryBB);
10074 while (!Worklist.empty()) {
10075 BasicBlock *BB = Worklist.pop_back_val();
10076 BlockVector.push_back(BB);
10077 for (BasicBlock *SuccBB : successors(BB))
10078 if (BlockSet.insert(SuccBB).second)
10079 Worklist.push_back(SuccBB);
10080 }
10081}
10082
10083void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10084 uint64_t Size, int32_t Flags,
10085 GlobalValue::LinkageTypes,
10086 StringRef Name) {
10087 if (!Config.isGPU()) {
10090 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10091 return;
10092 }
10093 // TODO: Add support for global variables on the device after declare target
10094 // support.
10095 Function *Fn = dyn_cast<Function>(Addr);
10096 if (!Fn)
10097 return;
10098
10099 // Add a function attribute for the kernel.
10100 Fn->addFnAttr("kernel");
10101 if (T.isAMDGCN())
10102 Fn->addFnAttr("uniform-work-group-size", "true");
10103 Fn->addFnAttr(Attribute::MustProgress);
10104}
10105
10106// We only generate metadata for functions that contain target regions.
10107void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10108 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10109
10110 // If there are no entries, we don't need to do anything.
10111 if (OffloadInfoManager.empty())
10112 return;
10113
10115 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10116 TargetRegionEntryInfo>,
10117 16>
10118 OrderedEntries(OffloadInfoManager.size());
10119
10120 // Auxiliary methods to create metadata values and strings.
10121 auto &&GetMDInt = [this](unsigned V) {
10122 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10123 };
10124
10125 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10126
10127 // Create the offloading info metadata node.
10128 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10129 auto &&TargetRegionMetadataEmitter =
10130 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10131 const TargetRegionEntryInfo &EntryInfo,
10132 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10133 // Generate metadata for target regions. Each entry of this metadata
10134 // contains:
10135 // - Entry 0 -> Kind of this type of metadata (0).
10136 // - Entry 1 -> Device ID of the file where the entry was identified.
10137 // - Entry 2 -> File ID of the file where the entry was identified.
10138 // - Entry 3 -> Mangled name of the function where the entry was
10139 // identified.
10140 // - Entry 4 -> Line in the file where the entry was identified.
10141 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10142 // - Entry 6 -> Order the entry was created.
10143 // The first element of the metadata node is the kind.
10144 Metadata *Ops[] = {
10145 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10146 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10147 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10148 GetMDInt(E.getOrder())};
10149
10150 // Save this entry in the right position of the ordered entries array.
10151 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10152
10153 // Add metadata to the named metadata node.
10154 MD->addOperand(MDNode::get(C, Ops));
10155 };
10156
10157 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10158
10159 // Create function that emits metadata for each device global variable entry;
10160 auto &&DeviceGlobalVarMetadataEmitter =
10161 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10162 StringRef MangledName,
10163 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10164 // Generate metadata for global variables. Each entry of this metadata
10165 // contains:
10166 // - Entry 0 -> Kind of this type of metadata (1).
10167 // - Entry 1 -> Mangled name of the variable.
10168 // - Entry 2 -> Declare target kind.
10169 // - Entry 3 -> Order the entry was created.
10170 // The first element of the metadata node is the kind.
10171 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10172 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10173
10174 // Save this entry in the right position of the ordered entries array.
10175 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10176 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10177
10178 // Add metadata to the named metadata node.
10179 MD->addOperand(MDNode::get(C, Ops));
10180 };
10181
10182 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10183 DeviceGlobalVarMetadataEmitter);
10184
10185 for (const auto &E : OrderedEntries) {
10186 assert(E.first && "All ordered entries must exist!");
10187 if (const auto *CE =
10188 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10189 E.first)) {
10190 if (!CE->getID() || !CE->getAddress()) {
10191 // Do not blame the entry if the parent function is not emitted.
10192 TargetRegionEntryInfo EntryInfo = E.second;
10193 StringRef FnName = EntryInfo.ParentName;
10194 if (!M.getNamedValue(FnName))
10195 continue;
10196 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10197 continue;
10198 }
10199 createOffloadEntry(CE->getID(), CE->getAddress(),
10200 /*Size=*/0, CE->getFlags(),
10201 GlobalValue::WeakAnyLinkage);
10202 } else if (const auto *CE = dyn_cast<
10203 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10204 E.first)) {
10205 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10206 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10207 CE->getFlags());
10208 switch (Flags) {
10209 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10210 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10211 if (Config.hasRequiresUnifiedSharedMemory())
10212 continue;
10213 if (!CE->getAddress()) {
10214 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10215 continue;
10216 }
10217 // The variable has no definition - no need to add the entry.
10218 if (CE->getVarSize() == 0)
10219 continue;
10220 break;
10221 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10222 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10223 (!Config.isTargetDevice() && CE->getAddress())) &&
10224 "Declare target link address is set.");
10225 if (Config.isTargetDevice())
10226 continue;
10227 if (!CE->getAddress()) {
10228 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10229 continue;
10230 }
10231 break;
10232 default:
10233 break;
10234 }
10235
10236 // Hidden or internal symbols on the device are not externally visible.
10237 // We should not attempt to register them by creating an offloading
10238 // entry. Indirect variables are handled separately on the device.
10239 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10240 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10241 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10242 continue;
10243
10244 // Indirect globals need to use a special name that doesn't match the name
10245 // of the associated host global.
10246 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10247 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10248 Flags, CE->getLinkage(), CE->getVarName());
10249 else
10250 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10251 Flags, CE->getLinkage());
10252
10253 } else {
10254 llvm_unreachable("Unsupported entry kind.");
10255 }
10256 }
10257
10258 // Emit requires directive globals to a special entry so the runtime can
10259 // register them when the device image is loaded.
10260 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10261 // entries should be redesigned to better suit this use-case.
10266 ".requires", /*Size=*/0,
10269}
10270
10271void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10272 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10273 unsigned FileID, unsigned Line, unsigned Count) {
10274 raw_svector_ostream OS(Name);
10275 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10276 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10277 if (Count)
10278 OS << "_" << Count;
10279}
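// Example: ParentName "foo", DeviceID 0x2a, FileID 0x3b and line 7 yield
// "__omp_offloading_2a_3b_foo_l7"; a second region at the same location gets
// a trailing "_1" from the Count suffix.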
10280
10281void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10282 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10283 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10284 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10285 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10286 EntryInfo.Line, NewCount);
10287}
10288
10289TargetRegionEntryInfo
10290OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10291 StringRef ParentName) {
10292 sys::fs::UniqueID ID(0xdeadf17e, 0);
10293 auto FileIDInfo = CallBack();
10294 uint64_t FileID = 0;
10295 std::error_code EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID);
10296 // If the inode ID could not be determined, create a hash value
10297 // of the current file name and use that as an ID.
10298 if (EC)
10299 FileID = hash_value(std::get<0>(FileIDInfo));
10300 else
10301 FileID = ID.getFile();
10302
10303 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10304 std::get<1>(FileIDInfo));
10305}
10306
10307unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10308 unsigned Offset = 0;
10309 for (uint64_t Remain =
10310 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10311 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10312 !(Remain & 1); Remain = Remain >> 1)
10313 Offset++;
10314 return Offset;
10315}
10316
10317omp::OpenMPOffloadMappingFlags
10318OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10319 // Rotate by getFlagMemberOffset() bits.
10320 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10321 << getFlagMemberOffset());
10322}
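// Worked example: OMP_MAP_MEMBER_OF occupies the top 16 bits of the 64-bit
// flag word, so getFlagMemberOffset() counts 48 trailing zero bits and
// getMemberOfFlag(2) returns (2 + 1) << 48, i.e. "member of the third
// component entry" (the field stores position + 1; 0xffff is the placeholder
// reset by setCorrectMemberOfFlag below).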
10323
10324void OpenMPIRBuilder::setCorrectMemberOfFlag(
10325 omp::OpenMPOffloadMappingFlags &Flags,
10326 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10327 // If the entry is PTR_AND_OBJ but has not been marked with the special
10328 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10329 // marked as MEMBER_OF.
10330 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10332 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10335 return;
10336
10337 // Reset the placeholder value to prepare the flag for the assignment of the
10338 // proper MEMBER_OF value.
10339 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10340 Flags |= MemberOfFlag;
10341}
10342
10346 bool IsDeclaration, bool IsExternallyVisible,
10347 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10348 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10349 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10350 std::function<Constant *()> GlobalInitializer,
10351 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10352 // TODO: convert this to utilise the IRBuilder Config rather than
10353 // a passed down argument.
10354 if (OpenMPSIMD)
10355 return nullptr;
10356
10359 CaptureClause ==
10362 SmallString<64> PtrName;
10363 {
10364 raw_svector_ostream OS(PtrName);
10365 OS << MangledName;
10366 if (!IsExternallyVisible)
10367 OS << format("_%x", EntryInfo.FileID);
10368 OS << "_decl_tgt_ref_ptr";
10369 }
10370
10371 Value *Ptr = M.getNamedValue(PtrName);
10372
10373 if (!Ptr) {
10374 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10375 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10376
10377 auto *GV = cast<GlobalVariable>(Ptr);
10378 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10379
10380 if (!Config.isTargetDevice()) {
10381 if (GlobalInitializer)
10382 GV->setInitializer(GlobalInitializer());
10383 else
10384 GV->setInitializer(GlobalValue);
10385 }
10386
10388 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10389 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10390 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10391 }
10392
10393 return cast<Constant>(Ptr);
10394 }
10395
10396 return nullptr;
10397}
10398
10402 bool IsDeclaration, bool IsExternallyVisible,
10403 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10404 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10405 std::vector<Triple> TargetTriple,
10406 std::function<Constant *()> GlobalInitializer,
10407 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10408 Constant *Addr) {
10410 (TargetTriple.empty() && !Config.isTargetDevice()))
10411 return;
10412
10413 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10414 StringRef VarName;
10415 int64_t VarSize;
10416 GlobalValue::LinkageTypes Linkage;
10417
10418 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10419 CaptureClause ==
10420 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10421 !Config.hasRequiresUnifiedSharedMemory()) {
10422 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10423 VarName = MangledName;
10424 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10425
10426 if (!IsDeclaration)
10427 VarSize = divideCeil(
10428 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10429 else
10430 VarSize = 0;
10431 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10432
10433 // This is a workaround carried over from Clang which prevents undesired
10434 // optimisation of internal variables.
10435 if (Config.isTargetDevice() &&
10436 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10437 // Do not create a "ref-variable" if the original is not also available
10438 // on the host.
10439 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10440 return;
10441
10442 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10443
10444 if (!M.getNamedValue(RefName)) {
10445 Constant *AddrRef =
10446 getOrCreateInternalVariable(Addr->getType(), RefName);
10447 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10448 GvAddrRef->setConstant(true);
10449 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10450 GvAddrRef->setInitializer(Addr);
10451 GeneratedRefs.push_back(GvAddrRef);
10452 }
10453 }
10454 } else {
10457 else
10459
10460 if (Config.isTargetDevice()) {
10461 VarName = (Addr) ? Addr->getName() : "";
10462 Addr = nullptr;
10463 } else {
10464 Addr = getAddrOfDeclareTargetVar(
10465 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10466 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10467 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10468 VarName = (Addr) ? Addr->getName() : "";
10469 }
10470 VarSize = M.getDataLayout().getPointerSize();
10471 Linkage = GlobalValue::WeakAnyLinkage;
10472 }
10473
10474 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10475 Flags, Linkage);
10476}
10477
10478/// Loads all the offload entries information from the host IR
10479/// metadata.
10480void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10481 // If we are in target mode, load the metadata from the host IR. This code has
10482 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10483
10485 if (!MD)
10486 return;
10487
10488 for (MDNode *MN : MD->operands()) {
10489 auto &&GetMDInt = [MN](unsigned Idx) {
10490 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10491 return cast<ConstantInt>(V->getValue())->getZExtValue();
10492 };
10493
10494 auto &&GetMDString = [MN](unsigned Idx) {
10495 auto *V = cast<MDString>(MN->getOperand(Idx));
10496 return V->getString();
10497 };
10498
10499 switch (GetMDInt(0)) {
10500 default:
10501 llvm_unreachable("Unexpected metadata!");
10502 break;
10503 case OffloadEntriesInfoManager::OffloadEntryInfo::
10504 OffloadingEntryInfoTargetRegion: {
10505 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10506 /*DeviceID=*/GetMDInt(1),
10507 /*FileID=*/GetMDInt(2),
10508 /*Line=*/GetMDInt(4),
10509 /*Count=*/GetMDInt(5));
10510 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10511 /*Order=*/GetMDInt(6));
10512 break;
10513 }
10514 case OffloadEntriesInfoManager::OffloadEntryInfo::
10515 OffloadingEntryInfoDeviceGlobalVar:
10516 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10517 /*MangledName=*/GetMDString(1),
10518 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10519 /*Flags=*/GetMDInt(2)),
10520 /*Order=*/GetMDInt(3));
10521 break;
10522 }
10523 }
10524}
10525
10527 if (HostFilePath.empty())
10528 return;
10529
10530 auto Buf = MemoryBuffer::getFile(HostFilePath);
10531 if (std::error_code Err = Buf.getError()) {
10532 report_fatal_error(("error opening host file from host file path inside of "
10533 "OpenMPIRBuilder: " +
10534 Err.message())
10535 .c_str());
10536 }
10537
10538 LLVMContext Ctx;
10539 auto M = expectedToErrorOrAndEmitErrors(
10540 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10541 if (std::error_code Err = M.getError()) {
10543 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10544 .c_str());
10545 }
10546
10547 loadOffloadInfoMetadata(*M.get());
10548}
10549
10550//===----------------------------------------------------------------------===//
10551// OffloadEntriesInfoManager
10552//===----------------------------------------------------------------------===//
10553
10554bool OffloadEntriesInfoManager::empty() const {
10555 return OffloadEntriesTargetRegion.empty() &&
10556 OffloadEntriesDeviceGlobalVar.empty();
10557}
10558
10559unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10560 const TargetRegionEntryInfo &EntryInfo) const {
10561 auto It = OffloadEntriesTargetRegionCount.find(
10562 getTargetRegionEntryCountKey(EntryInfo));
10563 if (It == OffloadEntriesTargetRegionCount.end())
10564 return 0;
10565 return It->second;
10566}
10567
10568void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10569 const TargetRegionEntryInfo &EntryInfo) {
10570 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10571 EntryInfo.Count + 1;
10572}
10573
10574/// Initialize target region entry.
10575void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10576 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10577 OffloadEntriesTargetRegion[EntryInfo] =
10578 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10579 OMPTargetRegionEntryTargetRegion);
10580 ++OffloadingEntriesNum;
10581}
10582
10583void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10584 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10585 OMPTargetRegionEntryKind Flags) {
10586 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10587
10588 // Update the EntryInfo with the next available count for this location.
10589 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10590
10591 // If we are emitting code for a target, the entry is already initialized,
10592 // only has to be registered.
10593 if (OMPBuilder->Config.isTargetDevice()) {
10594 // This could happen if the device compilation is invoked standalone.
10595 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10596 return;
10597 }
10598 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10599 Entry.setAddress(Addr);
10600 Entry.setID(ID);
10601 Entry.setFlags(Flags);
10602 } else {
10604 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10605 return;
10606 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10607 "Target region entry already registered!");
10608 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10609 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10610 ++OffloadingEntriesNum;
10611 }
10612 incrementTargetRegionEntryInfoCount(EntryInfo);
10613}
10614
10615bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10616 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10617
10618 // Update the EntryInfo with the next available count for this location.
10619 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10620
10621 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10622 if (It == OffloadEntriesTargetRegion.end()) {
10623 return false;
10624 }
10625 // Fail if this entry is already registered.
10626 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10627 return false;
10628 return true;
10629}
10630
10631void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10632 const OffloadTargetRegionEntryInfoActTy &Action) {
10633 // Scan all target region entries and perform the provided action.
10634 for (const auto &It : OffloadEntriesTargetRegion) {
10635 Action(It.first, It.second);
10636 }
10637}
10638
10639void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10640 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10641 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10642 ++OffloadingEntriesNum;
10643}
10644
10645void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10646 StringRef VarName, Constant *Addr, int64_t VarSize,
10647 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10648 if (OMPBuilder->Config.isTargetDevice()) {
10649 // This could happen if the device compilation is invoked standalone.
10650 if (!hasDeviceGlobalVarEntryInfo(VarName))
10651 return;
10652 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10653 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10654 if (Entry.getVarSize() == 0) {
10655 Entry.setVarSize(VarSize);
10656 Entry.setLinkage(Linkage);
10657 }
10658 return;
10659 }
10660 Entry.setVarSize(VarSize);
10661 Entry.setLinkage(Linkage);
10662 Entry.setAddress(Addr);
10663 } else {
10664 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10665 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10666 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10667 "Entry not initialized!");
10668 if (Entry.getVarSize() == 0) {
10669 Entry.setVarSize(VarSize);
10670 Entry.setLinkage(Linkage);
10671 }
10672 return;
10673 }
10674 if (Flags == OMPTargetGlobalVarEntryIndirect)
10675 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10676 Addr, VarSize, Flags, Linkage,
10677 VarName.str());
10678 else
10679 OffloadEntriesDeviceGlobalVar.try_emplace(
10680 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10681 ++OffloadingEntriesNum;
10682 }
10683}
10684
10685void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10686 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10687 // Scan all device global variable entries and perform the provided action.
10688 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10689 Action(E.getKey(), E.getValue());
10690}
10691
10692//===----------------------------------------------------------------------===//
10693// CanonicalLoopInfo
10694//===----------------------------------------------------------------------===//
10695
10696void CanonicalLoopInfo::collectControlBlocks(
10698 // We only count those BBs as control block for which we do not need to
10699 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10700 // flow. For consistency, this also means we do not add the Body block, which
10701 // is just the entry to the body code.
10702 BBs.reserve(BBs.size() + 6);
10703 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10704}
10705
10706BasicBlock *CanonicalLoopInfo::getPreheader() const {
10707 assert(isValid() && "Requires a valid canonical loop");
10708 for (BasicBlock *Pred : predecessors(Header)) {
10709 if (Pred != Latch)
10710 return Pred;
10711 }
10712 llvm_unreachable("Missing preheader");
10713}
10714
10715void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10716 assert(isValid() && "Requires a valid canonical loop");
10717
10718 Instruction *CmpI = &getCond()->front();
10719 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10720 CmpI->setOperand(1, TripCount);
10721
10722#ifndef NDEBUG
10723 assertOK();
10724#endif
10725}
10726
10727void CanonicalLoopInfo::mapIndVar(
10728 llvm::function_ref<Value *(Instruction *)> Updater) {
10729 assert(isValid() && "Requires a valid canonical loop");
10730
10731 Instruction *OldIV = getIndVar();
10732
10733 // Record all uses excluding those introduced by the updater. Uses by the
10734 // CanonicalLoopInfo itself to keep track of the number of iterations are
10735 // excluded.
10736 SmallVector<Use *> ReplacableUses;
10737 for (Use &U : OldIV->uses()) {
10738 auto *User = dyn_cast<Instruction>(U.getUser());
10739 if (!User)
10740 continue;
10741 if (User->getParent() == getCond())
10742 continue;
10743 if (User->getParent() == getLatch())
10744 continue;
10745 ReplacableUses.push_back(&U);
10746 }
10747
10748 // Run the updater that may introduce new uses
10749 Value *NewIV = Updater(OldIV);
10750
10751 // Replace the old uses with the value returned by the updater.
10752 for (Use *U : ReplacableUses)
10753 U->set(NewIV);
10754
10755#ifndef NDEBUG
10756 assertOK();
10757#endif
10758}
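// Typical use (sketch): an Updater returning `mul i32 %indvar, %step`
// rescales every body use of the canonical induction variable, while the
// compare in Cond and the increment in Latch intentionally keep the
// original PHI.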
10759
10760void CanonicalLoopInfo::assertOK() const {
10761#ifndef NDEBUG
10762 // No constraints if this object currently does not describe a loop.
10763 if (!isValid())
10764 return;
10765
10766 BasicBlock *Preheader = getPreheader();
10767 BasicBlock *Body = getBody();
10768 BasicBlock *After = getAfter();
10769
10770 // Verify standard control-flow we use for OpenMP loops.
10771 assert(Preheader);
10772 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10773 "Preheader must terminate with unconditional branch");
10774 assert(Preheader->getSingleSuccessor() == Header &&
10775 "Preheader must jump to header");
10776
10777 assert(Header);
10778 assert(isa<BranchInst>(Header->getTerminator()) &&
10779 "Header must terminate with unconditional branch");
10780 assert(Header->getSingleSuccessor() == Cond &&
10781 "Header must jump to exiting block");
10782
10783 assert(Cond);
10784 assert(Cond->getSinglePredecessor() == Header &&
10785 "Exiting block only reachable from header");
10786
10787 assert(isa<BranchInst>(Cond->getTerminator()) &&
10788 "Exiting block must terminate with conditional branch");
10789 assert(size(successors(Cond)) == 2 &&
10790 "Exiting block must have two successors");
10791 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10792 "Exiting block's first successor jump to the body");
10793 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10794 "Exiting block's second successor must exit the loop");
10795
10796 assert(Body);
10797 assert(Body->getSinglePredecessor() == Cond &&
10798 "Body only reachable from exiting block");
10799 assert(!isa<PHINode>(Body->front()));
10800
10801 assert(Latch);
10802 assert(isa<BranchInst>(Latch->getTerminator()) &&
10803 "Latch must terminate with unconditional branch");
10804 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10805 // TODO: To support simple redirecting of the end of the body code that has
10806 // multiple predecessors; introduce another auxiliary basic block like preheader and after.
10807 assert(Latch->getSinglePredecessor() != nullptr);
10808 assert(!isa<PHINode>(Latch->front()));
10809
10810 assert(Exit);
10811 assert(isa<BranchInst>(Exit->getTerminator()) &&
10812 "Exit block must terminate with unconditional branch");
10813 assert(Exit->getSingleSuccessor() == After &&
10814 "Exit block must jump to after block");
10815
10816 assert(After);
10817 assert(After->getSinglePredecessor() == Exit &&
10818 "After block only reachable from exit block");
10819 assert(After->empty() || !isa<PHINode>(After->front()));
10820
10821 Instruction *IndVar = getIndVar();
10822 assert(IndVar && "Canonical induction variable not found?");
10823 assert(isa<IntegerType>(IndVar->getType()) &&
10824 "Induction variable must be an integer");
10825 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10826 "Induction variable must be a PHI in the loop header");
10827 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10828 assert(
10829 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10830 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10831
10832 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10833 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10834 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10835 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10836 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10837 ->isOne());
10838
10839 Value *TripCount = getTripCount();
10840 assert(TripCount && "Loop trip count not found?");
10841 assert(IndVar->getType() == TripCount->getType() &&
10842 "Trip count and induction variable must have the same type");
10843
10844 auto *CmpI = cast<CmpInst>(&Cond->front());
10845 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10846 "Exit condition must be a signed less-than comparison");
10847 assert(CmpI->getOperand(0) == IndVar &&
10848 "Exit condition must compare the induction variable");
10849 assert(CmpI->getOperand(1) == TripCount &&
10850 "Exit condition must compare with the trip count");
10851#endif
10852}
10853
10854void CanonicalLoopInfo::invalidate() {
10855 Header = nullptr;
10856 Cond = nullptr;
10857 Latch = nullptr;
10858 Exit = nullptr;
10859}
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn)
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:64
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:128
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:101
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:121
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:106
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:132
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:97
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:473
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition: Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
Class to represent array types.
Definition: DerivedTypes.h:398
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:657
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition: Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:721
@ Add
*p = old + v
Definition: Instructions.h:725
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:777
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
Definition: Instructions.h:765
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ Sub
*p = old - v
Definition: Instructions.h:727
@ And
*p = old & v
Definition: Instructions.h:729
@ Xor
*p = old ^ v
Definition: Instructions.h:735
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:781
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
Definition: Instructions.h:761
@ FSub
*p = old - v
Definition: Instructions.h:749
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:769
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:773
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
LLVM_ABI AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
LLVM_ABI AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
LLVM_ABI AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:619
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:944
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:929
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:400
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:646
iterator end()
Definition: BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:393
reverse_iterator rbegin()
Definition: BasicBlock.h:475
bool empty() const
Definition: BasicBlock.h:481
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:337
const Instruction & front() const
Definition: BasicBlock.h:482
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:354
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:555
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:475
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:437
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:445
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:467
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:235
reverse_iterator rend()
Definition: BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:131
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:662
const Instruction & back() const
Definition: BasicBlock.h:484
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:248
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:494
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1956
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1267
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1273
unsigned arg_size() const
Definition: InstrTypes.h:1290
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:682
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:47
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:87
LLVM_ABI void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
LLVM_ABI Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
LLVM_ABI void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
LLVM_ABI bool isEligible() const
Test whether this code extractor is eligible.
LLVM_ABI void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1314
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:535
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2989
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2246
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
Definition: Constants.cpp:2240
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2261
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
Definition: Constants.cpp:2489
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2340
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:868
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1833
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1380
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
Debug location.
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:248
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:533
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:230
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
LLVM_ABI unsigned getPointerSize(unsigned AS=0) const
The pointer representation size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:738
unsigned getIndexSizeInBits(unsigned AS) const
The size in bits of indices used for address calculation in getelementptr and for addresses in the gi...
Definition: DataLayout.h:398
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:468
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition: DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:384
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
Lightweight error class with error context and mandatory checking.
Definition: Error.h:159
static ErrorSuccess success()
Create a success value.
Definition: Error.h:336
Tagged union holding either a T or a Error.
Definition: Error.h:485
Error takeError()
Take ownership of the stored error.
Definition: Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:166
const BasicBlock & getEntryBlock() const
Definition: Function.h:807
bool empty() const
Definition: Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:774
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
const Function & getFunction() const
Definition: Function.h:164
iterator begin()
Definition: Function.h:851
arg_iterator arg_begin()
Definition: Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:753
size_t arg_size() const
Definition: Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:214
iterator end()
Definition: Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:274
Argument * getArg(unsigned i) const
Definition: Function.h:884
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1605
LinkageTypes getLinkage() const
Definition: GlobalValue.h:548
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:539
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
void setDSOLocal(bool Local)
Definition: GlobalValue.h:305
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:296
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:70
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:256
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:56
Type * getValueType() const
Definition: GlobalValue.h:298
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:291
BasicBlock * getBlock() const
Definition: IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:497
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1450
LLVM_ABI Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1027
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2345
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1898
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1936
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1830
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2625
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2353
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:595
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2100
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:687
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1339
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2251
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2618
LLVM_ABI CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1240
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2128
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:202
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:2029
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:617
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2094
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:562
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2142
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1412
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:247
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:567
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1931
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2263
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1454
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2333
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1416
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:557
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:527
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1781
LLVM_ABI CallInst * CreateMalloc(Type *IntPtrTy, Type *AllocTy, Value *AllocSize, Value *ArraySize, ArrayRef< OperandBundleDef > OpB, Function *MallocF=nullptr, const Twine &Name="")
Definition: IRBuilder.cpp:259
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:311
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
LLVM_ABI CallInst * CreateFree(Value *Source, ArrayRef< OperandBundleDef > Bundles={})
Generate the IR for a call to the builtin free function.
Definition: IRBuilder.cpp:311
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1220
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2329
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:172
LLVM_ABI DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:63
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1420
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:533
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1197
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1847
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
LLVMContext & getContext() const
Definition: IRBuilder.h:203
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1551
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1167
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1970
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:2016
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1860
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1403
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2194
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1463
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2651
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1911
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1191
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:196
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2361
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:517
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2341
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2277
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:323
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2646
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:600
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1883
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1532
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1599
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2439
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:552
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1480
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2115
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2209
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1437
LLVM_ABI GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:43
Value * CreateNUWSub(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1433
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:90
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:78
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:428
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1718
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:510
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
Value * getPointerOperand()
Definition: Instructions.h:259
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:245
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:215
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:570
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:981
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:119
Metadata node.
Definition: Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1078
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1573
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1443
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1565
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:607
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:56
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:281
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:295
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:285
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:229
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:269
iterator_range< global_iterator > globals()
Definition: Module.h:684
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:596
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:430
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:171
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:302
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:445
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:278
A tuple of MDNodes.
Definition: Metadata.h:1753
iterator_range< op_iterator > operands()
Definition: Metadata.h:1849
LLVM_ABI void addOperand(MDNode *M)
Definition: Metadata.cpp:1471
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:255
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:257
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:390
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:392
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:308
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:310
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:299
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry..
Definition: OMPIRBuilder.h:370
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:376
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:382
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:380
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
Definition: OMPIRBuilder.h:374
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
Definition: OMPIRBuilder.h:372
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:444
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:104
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:200
StringRef separator() const
Definition: OMPIRBuilder.h:186
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:176
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:117
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:159
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
LLVM_ABI bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:153
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:196
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:485
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:560
LLVM_ABI void emitBranch(BasicBlock *Target)
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number.
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up its attributes. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for: X = Expr. Only scalar data types.
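A minimal sketch of emitting 'X = Expr' atomically with this entry point; OMPIRB, XPtr, Expr, and AllocaIP are assumed caller-provided, and the AtomicOpValue field order (Var, ElemTy, IsSigned, IsVolatile) is an assumption based on OMPIRBuilder.h:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

static void emitAtomicWriteSketch(OpenMPIRBuilder &OMPIRB, Value *XPtr,
                                  Value *Expr,
                                  OpenMPIRBuilder::InsertPointTy AllocaIP) {
  OpenMPIRBuilder::LocationDescription Loc(OMPIRB.Builder);
  // Describe the lvalue: pointer, element type, signedness, volatility
  // (field order is an assumption, see the lead-in).
  OpenMPIRBuilder::AtomicOpValue X{XPtr, Expr->getType(),
                                   /*IsSigned=*/false, /*IsVolatile=*/false};
  OMPIRB.Builder.restoreIP(OMPIRB.createAtomicWrite(
      Loc, X, Expr, AtomicOrdering::Monotonic, AllocaIP));
}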
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for the OpenMP 'if' clause using the specified BodyGenCallbackTy. Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false)
Generator for '#omp target'.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
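Putting the callback types above together, a hedged sketch of emitting a parallel region with no captured-variable privatization; OMPIRB and AllocaIP are assumed to come from the caller, and the callback shapes follow the typedefs listed on this page:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

static Error emitParallelSketch(OpenMPIRBuilder &OMPIRB,
                                OpenMPIRBuilder::InsertPointTy AllocaIP) {
  OpenMPIRBuilder::LocationDescription Loc(OMPIRB.Builder);
  // Region body: emitted once at CodeGenIP.
  auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy,
                     OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
    OMPIRB.Builder.restoreIP(CodeGenIP);
    // ... emit the parallel region body here ...
    return Error::success();
  };
  // No privatization needed in this sketch: reuse the inner value as-is.
  auto PrivCB = [](OpenMPIRBuilder::InsertPointTy,
                   OpenMPIRBuilder::InsertPointTy CodeGenIP, Value &,
                   Value &Inner, Value *&ReplVal)
      -> OpenMPIRBuilder::InsertPointOrErrorTy {
    ReplVal = &Inner;
    return CodeGenIP;
  };
  auto FiniCB = [](OpenMPIRBuilder::InsertPointTy) -> Error {
    return Error::success();
  };
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPIRB.createParallel(
      Loc, AllocaIP, BodyGen, PrivCB, FiniCB, /*IfCondition=*/nullptr,
      /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
      /*IsCancellable=*/false);
  if (!AfterIP)
    return AfterIP.takeError();
  OMPIRB.Builder.restoreIP(*AfterIP);
  return Error::success();
}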
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (only scalar data types): cond-expr-stmt: x = x ordop expr ?...
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
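A hedged sketch of tiling a depth-2 canonical loop nest with 32x32 tiles; Loops is assumed to hold the nest's CanonicalLoopInfo pointers, outermost first, with i64 induction variables (the tile sizes must match that type):

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

static std::vector<CanonicalLoopInfo *>
tile32x32(OpenMPIRBuilder &OMPIRB, DebugLoc DL,
          ArrayRef<CanonicalLoopInfo *> Loops) {
  Value *Sizes[] = {OMPIRB.Builder.getInt64(32), OMPIRB.Builder.getInt64(32)};
  // The result holds the floor loops followed by the tile loops
  // (four loops for a two-loop input nest).
  return OMPIRB.tileLoops(DL, Loops, Sizes);
}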
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for '#omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for '#omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive splits and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:536
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be freed.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Callback type used to generate a function with the given name.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
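A minimal sketch of emitting an explicit barrier at the builder's current position; OMPIRB is assumed to be an initialized OpenMPIRBuilder:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

static Error emitExplicitBarrier(OpenMPIRBuilder &OMPIRB) {
  // The location is taken from wherever the builder currently points.
  OpenMPIRBuilder::LocationDescription Loc(OMPIRB.Builder);
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPIRB.createBarrier(Loc, omp::Directive::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  OMPIRB.Builder.restoreIP(*AfterIP);
  return Error::success();
}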
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for the given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs (only scalar data types): V = X; X = X BinOp Expr,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
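A hedged sketch combining createCanonicalLoop with applyWorkshareLoop (declared above) to lower a counted loop to worksharing semantics; OMPIRB, AllocaIP, and TripCount are assumed caller-provided:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

static Error buildWorkshareLoop(OpenMPIRBuilder &OMPIRB,
                                OpenMPIRBuilder::InsertPointTy AllocaIP,
                                Value *TripCount) {
  OpenMPIRBuilder::LocationDescription Loc(OMPIRB.Builder);
  // The body callback receives the loop's induction variable.
  auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
                     Value *IV) -> Error {
    OMPIRB.Builder.restoreIP(CodeGenIP);
    // ... emit the loop body using IV here ...
    return Error::success();
  };
  Expected<CanonicalLoopInfo *> CLI =
      OMPIRB.createCanonicalLoop(Loc, BodyGen, TripCount);
  if (!CLI)
    return CLI.takeError();
  // Default schedule, with the implied barrier at the end of the loop.
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPIRB.applyWorkshareLoop(
      Loc.DL, *CLI, AllocaIP, /*NeedsBarrier=*/true);
  if (!AfterIP)
    return AfterIP.takeError();
  return Error::success();
}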
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:539
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Allocates a ScanInfo object and returns a pointer to it.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic read for: V = X. Only scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:700
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information needed to assist in lowering a scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after the scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before the scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, the input phase is being lowered; otherwise, the scan phase is being lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition: SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:247
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:99
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
Definition: SmallPtrSet.h:435
iterator end() const
Definition: SmallPtrSet.h:499
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
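A small self-contained illustration of the insert/count interface described above:

#include "llvm/ADT/SmallPtrSet.h"

void smallPtrSetExample() {
  int A = 0, B = 0;
  llvm::SmallPtrSet<int *, 4> Seen;
  auto [It, Inserted] = Seen.insert(&A); // Inserted == true: first insert.
  Seen.insert(&A);                       // Duplicate: the set is unchanged.
  bool HasB = Seen.count(&B) == 1;       // false: &B was never inserted.
  (void)It; (void)Inserted; (void)HasB;
}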
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:806
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
void setAlignment(Align Align)
Definition: Instructions.h:342
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:369
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:255
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:710
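A quick illustration of split; the values in the comments are worked by hand:

#include "llvm/ADT/StringRef.h"

void splitExample() {
  llvm::StringRef S = "key=value=rest";
  auto [Lhs, Rhs] = S.split('=');
  // Lhs == "key", Rhs == "value=rest". If the separator never occurs,
  // Lhs is the whole string and Rhs is empty.
  (void)Lhs;
  (void)Rhs;
}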
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:461
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:281
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:626
Class to represent struct types.
Definition: DerivedTypes.h:218
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:620
Type * getElementType(unsigned N) const
Definition: DerivedTypes.h:369
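A minimal sketch of building the literal struct type { i32, ptr } with the factory methods above (makePairTy is an illustrative name):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

StructType *makePairTy(LLVMContext &Ctx) {
  return StructType::get(Ctx, {Type::getInt32Ty(Ctx),
                               PointerType::getUnqual(Type::getInt32Ty(Ctx))});
}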
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:1038
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1100
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:409
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1110
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:261
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
LLVM_ABI Type * getStructElementType(unsigned N) const
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:132
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:148
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
void setOperand(unsigned i, Value *Val)
Definition: User.h:237
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
User * user_back()
Definition: Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:554
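A short sketch of the selective-replacement pattern this enables, restricting the rewrite to uses inside one function (replaceUsesInFunction is an illustrative name):

#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

void replaceUsesInFunction(Value *OldV, Value *NewV, Function *F) {
  OldV->replaceUsesWithIf(NewV, [F](Use &U) {
    auto *I = dyn_cast<Instruction>(U.getUser());
    return I && I->getFunction() == F;
  });
}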
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition: Value.cpp:188
bool use_empty() const
Definition: Value.h:346
user_iterator user_end()
Definition: Value.h:410
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:692
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ Exit
Definition: COFF.h:863
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
Definition: CallingConv.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
LLVM_ABI void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:85
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:198
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is a member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined; the runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:255
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:286
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:270
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
LLVM_ABI std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:860
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:870
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
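A small illustration; the structured bindings give the zero-based index and a reference to the element:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

void enumerateExample() {
  SmallVector<int, 4> V = {10, 20, 30};
  for (auto [Idx, Val] : enumerate(V)) {
    (void)Idx; // 0, 1, 2 across the iterations.
    ++Val;     // Mutates the underlying vector element.
  }
}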
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
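The classic use is deleting the element just visited, which a plain range-for would not allow; a sketch over a basic block:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Erase trivially dead instructions while walking the block; the iterator
// advances past each instruction before the body may erase it.
void eraseTriviallyDead(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
      I.eraseFromParent();
}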
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:68
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expression users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:1128
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:126
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
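Worked examples for Log2_32 and divideCeil above; the values in the asserts are computed by hand:

#include "llvm/Support/MathExtras.h"
#include <cassert>

void mathHelperExamples() {
  assert(llvm::Log2_32(32) == 5);       // floor(log2(32)) == 5
  assert(llvm::Log2_32(33) == 5);       // floor, not ceiling
  assert(llvm::divideCeil(10, 4) == 3); // ceil(10/4) == 3
}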
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:22
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:662
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:214
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61