//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

18#include "llvm/ADT/StringRef.h"
28#include "llvm/IR/Attributes.h"
29#include "llvm/IR/BasicBlock.h"
30#include "llvm/IR/CFG.h"
31#include "llvm/IR/CallingConv.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DIBuilder.h"
37#include "llvm/IR/Function.h"
39#include "llvm/IR/IRBuilder.h"
42#include "llvm/IR/LLVMContext.h"
43#include "llvm/IR/MDBuilder.h"
44#include "llvm/IR/Metadata.h"
46#include "llvm/IR/PassManager.h"
48#include "llvm/IR/Value.h"
60
61#include <cstdint>
62#include <optional>
63
#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. whether inserting
/// instructions at position IP1 may change the meaning of IP2 or vice-versa.
/// This is because an InsertPoint stores the instruction before which new
/// instructions are inserted. For instance, if both point to the same
/// instruction, two IRBuilders alternately creating instructions will cause
/// them to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
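// Illustrative sketch (not part of this file): with two builders parked on
// the same insert point, emitted instructions interleave, which is exactly
// what isConflictIP is used to assert against.
//
//   IRBuilder<> B1(Term), B2(Term); // both insert before Term
//   B1.CreateAdd(X, Y);             // add, Term
//   B2.CreateMul(X, Y);             // add, mul, Term
//   B1.CreateSub(X, Y);             // add, mul, sub, Term -- the two
//                                   // instruction streams are interleaved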

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
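// For example (illustrative, not exhaustive):
//   schedule(static)      -> BaseStatic
//   schedule(static, 4)   -> BaseStaticChunked
//   schedule(guided) simd -> BaseGuidedSimd
//   schedule(auto)        -> BaseAuto
//   schedule(runtime)     -> BaseRuntime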

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result) &&
         "Unexpected schedule type combination");
  return Result;
}
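// Worked examples (illustrative):
//   schedule(static)          -> BaseStatic | ModifierUnordered
//                                (static defaults to monotonic; no extra flag)
//   schedule(dynamic, 4)      -> BaseDynamicChunked | ModifierUnordered
//                                | ModifierNonmonotonic
//   schedule(dynamic) ordered -> BaseDynamicChunked | ModifierOrdered
//                                (ordered defaults to monotonic; no extra flag)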

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
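// Usage sketch (illustrative, not from this file): model a hidden i32
// argument of a to-be-outlined region and clean up afterwards.
//
//   SmallVector<Instruction *, 4> ToBeDeleted;
//   Value *TID = createFakeIntVal(Builder, OuterAllocaIP, ToBeDeleted,
//                                 InnerAllocaIP, "tid");
//   // ... outline the region; TID becomes a real argument ...
//   for (Instruction *I : llvm::reverse(ToBeDeleted)) // uses before defs
//     I->eraseFromParent();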

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}
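// For example (illustrative): after
//   #pragma omp requires unified_shared_memory
// the frontend calls setHasRequiresUnifiedSharedMemory(true) and
// getRequiresFlags() then reports 0x008; with no requires clauses at all it
// reports OMP_REQ_NONE (0x001) instead.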

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)               \
  case Enum:                                                                  \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                          \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                        \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)               \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                        \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                         \
  case Enum:                                                                  \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},       \
                             IsVarArg);                                       \
    Fn = M.getFunction(Str);                                                  \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }
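    // The emitted annotation looks roughly like this (illustrative):
    //   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
    //   !0 = !{!1}
    //   !1 = !{i64 2, i64 -1, i64 -1, i1 true}
    // i.e. the callee is argument 2, its first two parameters are unknown,
    // and the variadic arguments are forwarded to it.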

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before     outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After      outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target, or we risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it will yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
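// The emitted ident global looks roughly like this (illustrative):
//   @0 = private unnamed_addr constant %struct.ident_t
//        { i32 0, i32 2, i32 0, i32 <strlen>, ptr @.str }, align 8
// where flag 2 is OMP_IDENT_FLAG_KMPC ("C-mode") and @.str is the source
// location string created below.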

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
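// For example (illustrative), a location in function "foo" of file "bar.c"
// at line 3, column 7 is encoded as the string ";bar.c;foo;3;7;;".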

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
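// For a stand-alone barrier this emits, roughly (illustrative):
//   %tid = call i32 @__kmpc_global_thread_num(ptr @loc)
//   call void @__kmpc_barrier(ptr @loc, i32 %tid)
// or, inside a cancellable parallel region, a call to
// @__kmpc_cancel_barrier whose i32 result feeds the cancellation check.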

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork_call() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
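// The rewritten call site ends up as, roughly (illustrative):
//   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 %if_cond,
//                                 i32 %num_threads, i32 -1 /*proc_bind*/,
//                                 ptr @outlined.fn, ptr null /*wrapper*/,
//                                 ptr %args, i64 %nargs)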

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
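// The rewritten call site ends up as, roughly (illustrative):
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr @ident, i32 %nargs, ptr @outlined.fn,
//                         ptr %cap1, ..., ptr %capN)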
1491
1492OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1493 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1494 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1495 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1496 omp::ProcBindKind ProcBind, bool IsCancellable) {
1497 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1498
1499 if (!updateToLocation(Loc))
1500 return Loc.IP;
1501
1502 uint32_t SrcLocStrSize;
1503 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1504 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1505 Value *ThreadID = getOrCreateThreadID(Ident);
1506 // If we generate code for the target device, we need to allocate
1507 // struct for aggregate params in the device default alloca address space.
1508 // OpenMP runtime requires that the params of the extracted functions are
1509 // passed as zero address space pointers. This flag ensures that extracted
1510 // function arguments are declared in zero address space
1511 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1512
1513 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1514 // only if we compile for host side.
1515 if (NumThreads && !Config.isTargetDevice()) {
1516 Value *Args[] = {
1517 Ident, ThreadID,
1518 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1519 Builder.CreateCall(
1520 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1521 }
1522
1523 if (ProcBind != OMP_PROC_BIND_default) {
1524 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1525 Value *Args[] = {
1526 Ident, ThreadID,
1527 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1528 Builder.CreateCall(
1529 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1530 }
1531
1532 BasicBlock *InsertBB = Builder.GetInsertBlock();
1533 Function *OuterFn = InsertBB->getParent();
1534
1535 // Save the outer alloca block because the insertion iterator may get
1536 // invalidated and we still need this later.
1537 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1538
1539 // Vector to remember instructions we used only during the modeling but which
1540 // we want to delete at the end.
1541 SmallVector<Instruction *, 4> ToBeDeleted;
1542
1543 // Change the location to the outer alloca insertion point to create and
1544 // initialize the allocas we pass into the parallel region.
1545 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1546 Builder.restoreIP(NewOuter);
1547 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1548 AllocaInst *ZeroAddrAlloca =
1549 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1550 Instruction *TIDAddr = TIDAddrAlloca;
1551 Instruction *ZeroAddr = ZeroAddrAlloca;
1552 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1553 // Add additional casts to enforce pointers in zero address space
1554 TIDAddr = new AddrSpaceCastInst(
1555 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1556 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1557 ToBeDeleted.push_back(TIDAddr);
1558 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1559 PointerType::get(M.getContext(), 0),
1560 "zero.addr.ascast");
1561 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1562 ToBeDeleted.push_back(ZeroAddr);
1563 }
1564
1565 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1566 // associated arguments in the outlined function, so we delete them later.
1567 ToBeDeleted.push_back(TIDAddrAlloca);
1568 ToBeDeleted.push_back(ZeroAddrAlloca);
1569
1570 // Create an artificial insertion point that will also ensure the blocks we
1571 // are about to split are not degenerated.
1572 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1573
1574 BasicBlock *EntryBB = UI->getParent();
1575 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1576 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1577 BasicBlock *PRegPreFiniBB =
1578 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1579 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1580
1581 auto FiniCBWrapper = [&](InsertPointTy IP) {
1582 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1583 // target to the region exit block.
1584 if (IP.getBlock()->end() == IP.getPoint()) {
1585 IRBuilder<>::InsertPointGuard IPG(Builder);
1586 Builder.restoreIP(IP);
1587 Instruction *I = Builder.CreateBr(PRegExitBB);
1588 IP = InsertPointTy(I->getParent(), I->getIterator());
1589 }
1590 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1591 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1592 "Unexpected insertion point for finalization call!");
1593 return FiniCB(IP);
1594 };
1595
1596 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1597
1598 // Generate the privatization allocas in the block that will become the entry
1599 // of the outlined function.
1600 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1601 InsertPointTy InnerAllocaIP = Builder.saveIP();
1602
1603 AllocaInst *PrivTIDAddr =
1604 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1605 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1606
1607 // Add some fake uses for OpenMP provided arguments.
1608 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1609 Instruction *ZeroAddrUse =
1610 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1611 ToBeDeleted.push_back(ZeroAddrUse);
1612
1613 // EntryBB
1614 // |
1615 // V
1616 // PRegionEntryBB <- Privatization allocas are placed here.
1617 // |
1618 // V
1619 // PRegionBodyBB <- BodyGen is invoked here.
1620 // |
1621 // V
1622 // PRegPreFiniBB <- The block we will start finalization from.
1623 // |
1624 // V
1625 // PRegionExitBB <- A common exit to simplify block collection.
1626 //
1627
1628 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1629
1630 // Let the caller create the body.
1631 assert(BodyGenCB && "Expected body generation callback!");
1632 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1633 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1634 return Err;
1635
1636 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1637
1638 OutlineInfo OI;
1639 if (Config.isTargetDevice()) {
1640 // Generate OpenMP target specific runtime call
1641 OI.PostOutlineCB = [=, ToBeDeletedVec =
1642 std::move(ToBeDeleted)](Function &OutlinedFn) {
1643 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1644 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1645 ThreadID, ToBeDeletedVec);
1646 };
1647 } else {
1648 // Generate OpenMP host runtime call
1649 OI.PostOutlineCB = [=, ToBeDeletedVec =
1650 std::move(ToBeDeleted)](Function &OutlinedFn) {
1651 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1652 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1653 };
1654 }
1655
1656 OI.OuterAllocaBB = OuterAllocaBlock;
1657 OI.EntryBB = PRegEntryBB;
1658 OI.ExitBB = PRegExitBB;
1659
1660 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1661 SmallVector<BasicBlock *, 32> Blocks;
1662 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1663
1664 CodeExtractorAnalysisCache CEAC(*OuterFn);
1665 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1666 /* AggregateArgs */ false,
1667 /* BlockFrequencyInfo */ nullptr,
1668 /* BranchProbabilityInfo */ nullptr,
1669 /* AssumptionCache */ nullptr,
1670 /* AllowVarArgs */ true,
1671 /* AllowAlloca */ true,
1672 /* AllocationBlock */ OuterAllocaBlock,
1673 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1674
1675 // Find inputs to, outputs from the code region.
1676 BasicBlock *CommonExit = nullptr;
1677 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1678 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1679
1680 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1681 /*CollectGlobalInputs=*/true);
1682
1683 Inputs.remove_if([&](Value *I) {
1684 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1685 return GV->getValueType() == OpenMPIRBuilder::Ident;
1686
1687 return false;
1688 });
1689
1690 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1691
1692 FunctionCallee TIDRTLFn =
1693 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1694
1695 auto PrivHelper = [&](Value &V) -> Error {
1696 if (&V == TIDAddr || &V == ZeroAddr) {
1697 OI.ExcludeArgsFromAggregate.push_back(&V);
1698 return Error::success();
1699 }
1700
1701 SetVector<Use *> Uses;
1702 for (Use &U : V.uses())
1703 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1704 if (ParallelRegionBlockSet.count(UserI->getParent()))
1705 Uses.insert(&U);
1706
1707 // __kmpc_fork_call expects extra arguments as pointers. If the input
1708 // already has a pointer type, everything is fine. Otherwise, store the
1709 // value onto stack and load it back inside the to-be-outlined region. This
1710 // will ensure only the pointer will be passed to the function.
1711 // FIXME: if there are more than 15 trailing arguments, they must be
1712 // additionally packed in a struct.
1713 Value *Inner = &V;
1714 if (!V.getType()->isPointerTy()) {
1715 IRBuilder<>::InsertPointGuard Guard(Builder);
1716 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1717
1718 Builder.restoreIP(OuterAllocaIP);
1719 Value *Ptr =
1720 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1721
1722 // Store to stack at end of the block that currently branches to the entry
1723 // block of the to-be-outlined region.
1724 Builder.SetInsertPoint(InsertBB,
1725 InsertBB->getTerminator()->getIterator());
1726 Builder.CreateStore(&V, Ptr);
1727
1728 // Load back next to allocations in the to-be-outlined region.
1729 Builder.restoreIP(InnerAllocaIP);
1730 Inner = Builder.CreateLoad(V.getType(), Ptr);
1731 }
1732
1733 Value *ReplacementValue = nullptr;
1734 CallInst *CI = dyn_cast<CallInst>(&V);
1735 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1736 ReplacementValue = PrivTID;
1737 } else {
1738 InsertPointOrErrorTy AfterIP =
1739 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1740 if (!AfterIP)
1741 return AfterIP.takeError();
1742 Builder.restoreIP(*AfterIP);
1743 InnerAllocaIP = {
1744 InnerAllocaIP.getBlock(),
1745 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1746
1747 assert(ReplacementValue &&
1748 "Expected copy/create callback to set replacement value!");
1749 if (ReplacementValue == &V)
1750 return Error::success();
1751 }
1752
1753 for (Use *UPtr : Uses)
1754 UPtr->set(ReplacementValue);
1755
1756 return Error::success();
1757 };
1758
1759 // Reset the inner alloca insertion as it will be used for loading the values
1760 // wrapped into pointers before passing them into the to-be-outlined region.
1761 // Configure it to insert immediately after the fake use of zero address so
1762 // that they are available in the generated body and so that the
1763 // OpenMP-related values (thread ID and zero address pointers) remain leading
1764 // in the argument list.
1765 InnerAllocaIP = IRBuilder<>::InsertPoint(
1766 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1767
1768 // Reset the outer alloca insertion point to the entry of the relevant block
1769 // in case it was invalidated.
1770 OuterAllocaIP = IRBuilder<>::InsertPoint(
1771 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1772
1773 for (Value *Input : Inputs) {
1774 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1775 if (Error Err = PrivHelper(*Input))
1776 return Err;
1777 }
1778 LLVM_DEBUG({
1779 for (Value *Output : Outputs)
1780 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1781 });
1782 assert(Outputs.empty() &&
1783 "OpenMP outlining should not produce live-out values!");
1784
1785 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1786 LLVM_DEBUG({
1787 for (auto *BB : Blocks)
1788 dbgs() << " PBR: " << BB->getName() << "\n";
1789 });
1790
1791 // Adjust the finalization stack, verify the adjustment, and call the
1792 // finalize function a last time to finalize values between the pre-fini
1793 // block and the exit block if we left the parallel "the normal way".
1794 auto FiniInfo = FinalizationStack.pop_back_val();
1795 (void)FiniInfo;
1796 assert(FiniInfo.DK == OMPD_parallel &&
1797 "Unexpected finalization stack state!");
1798
1799 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1800
1801 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1802 if (Error Err = FiniCB(PreFiniIP))
1803 return Err;
1804
1805 // Register the outlined info.
1806 addOutlineInfo(std::move(OI));
1807
1808 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1809 UI->eraseFromParent();
1810
1811 return AfterIP;
1812}
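// A minimal caller-side sketch for createParallel, assuming an initialized
// OpenMPIRBuilder OMPB and suitable Loc/OuterAllocaIP values (all names
// hypothetical):
// ```
//   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
//   using InsertPointOrErrorTy = OpenMPIRBuilder::InsertPointOrErrorTy;
//   auto BodyGenCB = [&](InsertPointTy AllocaIP,
//                        InsertPointTy CodeGenIP) -> Error {
//     // Emit the parallel region body at CodeGenIP.
//     return Error::success();
//   };
//   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
//                     Value &Orig, Value &Inner,
//                     Value *&ReplVal) -> InsertPointOrErrorTy {
//     ReplVal = &Inner; // Keep shared semantics.
//     return CodeGenIP;
//   };
//   auto FiniCB = [&](InsertPointTy IP) -> Error { return Error::success(); };
//   InsertPointOrErrorTy AfterIP = OMPB.createParallel(
//       Loc, OuterAllocaIP, BodyGenCB, PrivCB, FiniCB,
//       /*IfCondition=*/nullptr, /*NumThreads=*/nullptr,
//       OMP_PROC_BIND_default, /*IsCancellable=*/false);
// ```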
1813
1814void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1815 // Build call void __kmpc_flush(ident_t *loc)
1816 uint32_t SrcLocStrSize;
1817 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1818 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1819
1820 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1821}
1822
1823void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1824 if (!updateToLocation(Loc))
1825 return;
1826 emitFlush(Loc);
1827}
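// The lowering is a single runtime call, e.g. (ident operand illustrative):
// ```
//   call void @__kmpc_flush(ptr @ident)
// ```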
1828
1829void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1830 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1831 // global_tid);
1832 uint32_t SrcLocStrSize;
1833 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1834 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1835 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1836
1837 // Ignore return result until untied tasks are supported.
1838 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1839 Args);
1840}
1841
1842void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1843 if (!updateToLocation(Loc))
1844 return;
1845 emitTaskwaitImpl(Loc);
1846}
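// Illustrative lowering (the kmp_int32 result is ignored, as noted above):
// ```
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call i32 @__kmpc_omp_taskwait(ptr @ident, i32 %gtid)
// ```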
1847
1848void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1849 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1850 uint32_t SrcLocStrSize;
1851 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1852 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1853 Constant *I32Null = ConstantInt::getNullValue(Int32);
1854 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1855
1856 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1857 Args);
1858}
1859
1860void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1861 if (!updateToLocation(Loc))
1862 return;
1863 emitTaskyieldImpl(Loc);
1864}
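// Illustrative lowering of the call built above:
// ```
//   call i32 @__kmpc_omp_taskyield(ptr @ident, i32 %gtid, i32 0)
// ```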
1865
1866 // Processes the dependencies in Dependencies and does the following
1867 // - Allocates space on the stack for an array of DependInfo objects
1868 // - Populates each DependInfo object with the relevant information about
1869 // the corresponding dependence.
1870 // - All code is inserted in the entry block of the current function.
1871 static Value *emitTaskDependencies(
1872 OpenMPIRBuilder &OMPBuilder,
1873 const SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
1874 // Early return if we have no dependencies to process
1875 if (Dependencies.empty())
1876 return nullptr;
1877
1878 // Given a vector of DependData objects, in this function we create an
1879 // array on the stack that holds kmp_dep_info objects corresponding
1880 // to each dependency. This is then passed to the OpenMP runtime.
1881 // For example, if there are 'n' dependencies then the following pseudo
1882 // code is generated. Assume the first dependence is on a variable 'a'.
1883 //
1884 // \code{c}
1885 // DepArray = alloc(n x sizeof(kmp_depend_info));
1886 // idx = 0;
1887 // DepArray[idx].base_addr = ptrtoint(&a);
1888 // DepArray[idx].len = 8;
1889 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1890 // ++idx;
1891 // DepArray[idx].base_addr = ...;
1892 // \endcode
1893
1894 IRBuilderBase &Builder = OMPBuilder.Builder;
1895 Type *DependInfo = OMPBuilder.DependInfo;
1896 Module &M = OMPBuilder.M;
1897
1898 Value *DepArray = nullptr;
1899 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1900 Builder.SetInsertPoint(
1901 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1902
1903 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1904 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1905
1906 Builder.restoreIP(OldIP);
1907
1908 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1909 Value *Base =
1910 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1911 // Store the pointer to the variable
1912 Value *Addr = Builder.CreateStructGEP(
1913 DependInfo, Base,
1914 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1915 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1916 Builder.CreateStore(DepValPtr, Addr);
1917 // Store the size of the variable
1918 Value *Size = Builder.CreateStructGEP(
1919 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1920 Builder.CreateStore(
1921 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1922 Size);
1923 // Store the dependency kind
1924 Value *Flags = Builder.CreateStructGEP(
1925 DependInfo, Base,
1926 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1927 Builder.CreateStore(
1928 ConstantInt::get(Builder.getInt8Ty(),
1929 static_cast<unsigned int>(Dep.DepKind)),
1930 Flags);
1931 }
1932 return DepArray;
1933}
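// Sketch of the emitted IR for one dependency on an i32 value %a (struct and
// value names illustrative; the alloca is placed in the function entry block,
// the field stores at the restored insertion point):
// ```
//   %.dep.arr.addr = alloca [1 x %kmp_dep_info]
//   ...
//   %base = getelementptr inbounds [1 x %kmp_dep_info], ptr %.dep.arr.addr, i64 0, i64 0
//   %addr = getelementptr inbounds %kmp_dep_info, ptr %base, i32 0, i32 0
//   %depaddr = ptrtoint ptr %a to i64
//   store i64 %depaddr, ptr %addr        ; base_addr
//   ; len and flags are stored through analogous GEPs
// ```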
1934
1935OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1936 const LocationDescription &Loc, InsertPointTy AllocaIP,
1937 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1938 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1939 Value *Priority) {
1940
1941 if (!updateToLocation(Loc))
1942 return InsertPointTy();
1943
1944 uint32_t SrcLocStrSize;
1945 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1946 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1947 // The current basic block is split into four basic blocks. After outlining,
1948 // they will be mapped as follows:
1949 // ```
1950 // def current_fn() {
1951 // current_basic_block:
1952 // br label %task.exit
1953 // task.exit:
1954 // ; instructions after task
1955 // }
1956 // def outlined_fn() {
1957 // task.alloca:
1958 // br label %task.body
1959 // task.body:
1960 // ret void
1961 // }
1962 // ```
1963 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1964 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1965 BasicBlock *TaskAllocaBB =
1966 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1967
1968 InsertPointTy TaskAllocaIP =
1969 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1970 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1971 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1972 return Err;
1973
1974 OutlineInfo OI;
1975 OI.EntryBB = TaskAllocaBB;
1976 OI.OuterAllocaBB = AllocaIP.getBlock();
1977 OI.ExitBB = TaskExitBB;
1978
1979 // Add the thread ID argument.
1980 SmallVector<Instruction *, 4> ToBeDeleted;
1981 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1982 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1983
1984 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1985 Mergeable, Priority, EventHandle, TaskAllocaBB,
1986 ToBeDeleted](Function &OutlinedFn) mutable {
1987 // Replace the Stale CI by appropriate RTL function call.
1988 assert(OutlinedFn.hasOneUse() &&
1989 "there must be a single user for the outlined function");
1990 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1991
1992 // HasShareds is true if any variables are captured in the outlined region,
1993 // false otherwise.
1994 bool HasShareds = StaleCI->arg_size() > 1;
1995 Builder.SetInsertPoint(StaleCI);
1996
1997 // Gather the arguments for emitting the runtime call for
1998 // @__kmpc_omp_task_alloc
1999 Function *TaskAllocFn =
2000 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2001
2002 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2003 // call.
2004 Value *ThreadID = getOrCreateThreadID(Ident);
2005
2006 // Argument - `flags`
2007 // Task is tied iff (Flags & 1) == 1.
2008 // Task is untied iff (Flags & 1) == 0.
2009 // Task is final iff (Flags & 2) == 2.
2010 // Task is not final iff (Flags & 2) == 0.
2011 // Task is mergeable iff (Flags & 4) == 4.
2012 // Task is not mergeable iff (Flags & 4) == 0.
2013 // Task is priority iff (Flags & 32) == 32.
2014 // Task is not priority iff (Flags & 32) == 0.
2015 // TODO: Handle the other flags.
2016 Value *Flags = Builder.getInt32(Tied);
2017 if (Final) {
2018 Value *FinalFlag =
2019 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2020 Flags = Builder.CreateOr(FinalFlag, Flags);
2021 }
2022
2023 if (Mergeable)
2024 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2025 if (Priority)
2026 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2027
2028 // Argument - `sizeof_kmp_task_t` (TaskSize)
2029 // Tasksize refers to the size in bytes of kmp_task_t data structure
2030 // including private vars accessed in task.
2031 // TODO: add kmp_task_t_with_privates (privates)
2032 Value *TaskSize = Builder.getInt64(
2033 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2034
2035 // Argument - `sizeof_shareds` (SharedsSize)
2036 // SharedsSize refers to the shareds array size in the kmp_task_t data
2037 // structure.
2038 Value *SharedsSize = Builder.getInt64(0);
2039 if (HasShareds) {
2040 AllocaInst *ArgStructAlloca =
2041 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2042 assert(ArgStructAlloca &&
2043 "Unable to find the alloca instruction corresponding to arguments "
2044 "for extracted function");
2045 StructType *ArgStructType =
2046 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2047 assert(ArgStructType && "Unable to find struct type corresponding to "
2048 "arguments for extracted function");
2049 SharedsSize =
2050 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2051 }
2052 // Emit the @__kmpc_omp_task_alloc runtime call
2053 // The runtime call returns a pointer to an area where the task captured
2054 // variables must be copied before the task is run (TaskData)
2055 CallInst *TaskData = Builder.CreateCall(
2056 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2057 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2058 /*task_func=*/&OutlinedFn});
2059
2060 // Emit detach clause initialization.
2061 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2062 // task_descriptor);
2063 if (EventHandle) {
2064 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2065 OMPRTL___kmpc_task_allow_completion_event);
2066 llvm::Value *EventVal =
2067 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2068 llvm::Value *EventHandleAddr =
2069 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2070 Builder.getPtrTy(0));
2071 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2072 Builder.CreateStore(EventVal, EventHandleAddr);
2073 }
2074 // Copy the arguments for outlined function
2075 if (HasShareds) {
2076 Value *Shareds = StaleCI->getArgOperand(1);
2077 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2078 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2079 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2080 SharedsSize);
2081 }
2082
2083 if (Priority) {
2084 //
2085 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2086 // we populate the priority information into the "kmp_task_t" here
2087 //
2088 // The struct "kmp_task_t" definition is available in kmp.h
2089 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2090 // data2 is used for priority
2091 //
2092 Type *Int32Ty = Builder.getInt32Ty();
2093 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2094 // kmp_task_t* => { ptr }
2095 Type *TaskPtr = StructType::get(VoidPtr);
2096 Value *TaskGEP =
2097 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2098 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2099 Type *TaskStructType = StructType::get(
2100 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2101 Value *PriorityData = Builder.CreateInBoundsGEP(
2102 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2103 // kmp_cmplrdata_t => { ptr, ptr }
2104 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2105 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2106 PriorityData, {Zero, Zero});
2107 Builder.CreateStore(Priority, CmplrData);
2108 }
2109
2110 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2111
2112 // In the presence of the `if` clause, the following IR is generated:
2113 // ...
2114 // %data = call @__kmpc_omp_task_alloc(...)
2115 // br i1 %if_condition, label %then, label %else
2116 // then:
2117 // call @__kmpc_omp_task(...)
2118 // br label %exit
2119 // else:
2120 // ;; Wait for resolution of dependencies, if any, before
2121 // ;; beginning the task
2122 // call @__kmpc_omp_wait_deps(...)
2123 // call @__kmpc_omp_task_begin_if0(...)
2124 // call @outlined_fn(...)
2125 // call @__kmpc_omp_task_complete_if0(...)
2126 // br label %exit
2127 // exit:
2128 // ...
2129 if (IfCondition) {
2130 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2131 // terminator.
2132 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2133 Instruction *IfTerminator =
2134 Builder.GetInsertPoint()->getParent()->getTerminator();
2135 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2136 Builder.SetInsertPoint(IfTerminator);
2137 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2138 &ElseTI);
2139 Builder.SetInsertPoint(ElseTI);
2140
2141 if (Dependencies.size()) {
2142 Function *TaskWaitFn =
2143 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2144 Builder.CreateCall(
2145 TaskWaitFn,
2146 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2147 ConstantInt::get(Builder.getInt32Ty(), 0),
2148 ConstantInt::getNullValue(Builder.getPtrTy())});
2149 }
2150 Function *TaskBeginFn =
2151 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2152 Function *TaskCompleteFn =
2153 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2154 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2155 CallInst *CI = nullptr;
2156 if (HasShareds)
2157 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2158 else
2159 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2160 CI->setDebugLoc(StaleCI->getDebugLoc());
2161 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2162 Builder.SetInsertPoint(ThenTI);
2163 }
2164
2165 if (Dependencies.size()) {
2166 Function *TaskFn =
2167 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2168 Builder.CreateCall(
2169 TaskFn,
2170 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2171 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2172 ConstantInt::getNullValue(Builder.getPtrTy())});
2173
2174 } else {
2175 // Emit the @__kmpc_omp_task runtime call to spawn the task
2176 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2177 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2178 }
2179
2180 StaleCI->eraseFromParent();
2181
2182 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2183 if (HasShareds) {
2184 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2185 OutlinedFn.getArg(1)->replaceUsesWithIf(
2186 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2187 }
2188
2189 for (Instruction *I : llvm::reverse(ToBeDeleted))
2190 I->eraseFromParent();
2191 };
2192
2193 addOutlineInfo(std::move(OI));
2194 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2195
2196 return Builder.saveIP();
2197}
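// A sketch of the resulting spawn sequence for a tied task without
// dependencies, an if clause, or captured shareds; the i64 size constants
// depend on the target data layout and are illustrative only:
// ```
//   %task = call ptr @__kmpc_omp_task_alloc(ptr @ident, i32 %gtid, i32 1,
//                                           i64 40, i64 0, ptr @outlined_fn)
//   call i32 @__kmpc_omp_task(ptr @ident, i32 %gtid, ptr %task)
// ```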
2198
2199OpenMPIRBuilder::InsertPointOrErrorTy
2200OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2201 InsertPointTy AllocaIP,
2202 BodyGenCallbackTy BodyGenCB) {
2203 if (!updateToLocation(Loc))
2204 return InsertPointTy();
2205
2206 uint32_t SrcLocStrSize;
2207 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2208 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2209 Value *ThreadID = getOrCreateThreadID(Ident);
2210
2211 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2212 Function *TaskgroupFn =
2213 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2214 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2215
2216 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2217 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2218 return Err;
2219
2220 Builder.SetInsertPoint(TaskgroupExitBB);
2221 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2222 Function *EndTaskgroupFn =
2223 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2224 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2225
2226 return Builder.saveIP();
2227}
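// The taskgroup region is thus bracketed as:
// ```
//   call void @__kmpc_taskgroup(ptr @ident, i32 %gtid)
//   ; ... body emitted by BodyGenCB ...
//   call void @__kmpc_end_taskgroup(ptr @ident, i32 %gtid)
// ```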
2228
2229OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2230 const LocationDescription &Loc, InsertPointTy AllocaIP,
2231 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2232 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2233 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2234
2235 if (!updateToLocation(Loc))
2236 return Loc.IP;
2237
2238 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2239 // that block may not have been created yet by the time this callback runs.
2240 SmallVector<BranchInst *> CancellationBranches;
2241 auto FiniCBWrapper = [&](InsertPointTy IP) {
2242 if (IP.getBlock()->end() != IP.getPoint())
2243 return FiniCB(IP);
2244 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2245 // will fail because that function requires the Finalization Basic Block to
2246 // have a terminator, which is already removed by EmitOMPRegionBody.
2247 // IP is currently at the cancellation block.
2248 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2249 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2250 CancellationBranches.push_back(DummyBranch);
2251 return FiniCB(IP);
2252 };
2253
2254 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2255
2256 // Each section is emitted as a switch case
2257 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2258 // -> OMP.createSection() which generates the IR for each section
2259 // Iterate through all sections and emit a switch construct:
2260 // switch (IV) {
2261 // case 0:
2262 // <SectionStmt[0]>;
2263 // break;
2264 // ...
2265 // case <NumSection> - 1:
2266 // <SectionStmt[<NumSection> - 1]>;
2267 // break;
2268 // }
2269 // ...
2270 // section_loop.after:
2271 // <FiniCB>;
2272 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2273 Builder.restoreIP(CodeGenIP);
2274 BasicBlock *Continue =
2275 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2276 Function *CurFn = Continue->getParent();
2277 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2278
2279 unsigned CaseNumber = 0;
2280 for (auto SectionCB : SectionCBs) {
2281 BasicBlock *CaseBB = BasicBlock::Create(
2282 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2283 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2284 Builder.SetInsertPoint(CaseBB);
2285 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2286 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2287 CaseEndBr->getIterator()}))
2288 return Err;
2289 CaseNumber++;
2290 }
2291 // remove the existing terminator from body BB since there can be no
2292 // terminators after switch/case
2293 return Error::success();
2294 };
2295 // Loop body ends here
2296 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2297 Type *I32Ty = Type::getInt32Ty(M.getContext());
2298 Value *LB = ConstantInt::get(I32Ty, 0);
2299 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2300 Value *ST = ConstantInt::get(I32Ty, 1);
2301 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2302 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2303 if (!LoopInfo)
2304 return LoopInfo.takeError();
2305
2306 InsertPointOrErrorTy WsloopIP =
2307 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2308 WorksharingLoopType::ForStaticLoop, !IsNowait);
2309 if (!WsloopIP)
2310 return WsloopIP.takeError();
2311 InsertPointTy AfterIP = *WsloopIP;
2312
2313 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2314 assert(LoopFini && "Bad structure of static workshare loop finalization");
2315
2316 // Apply the finalization callback in LoopAfterBB
2317 auto FiniInfo = FinalizationStack.pop_back_val();
2318 assert(FiniInfo.DK == OMPD_sections &&
2319 "Unexpected finalization stack state!");
2320 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2321 Builder.restoreIP(AfterIP);
2322 BasicBlock *FiniBB =
2323 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2324 if (Error Err = CB(Builder.saveIP()))
2325 return Err;
2326 AfterIP = {FiniBB, FiniBB->begin()};
2327 }
2328
2329 // Now we can fix the dummy branch to point to the right place
2330 for (BranchInst *DummyBranch : CancellationBranches) {
2331 assert(DummyBranch->getNumSuccessors() == 1);
2332 DummyBranch->setSuccessor(0, LoopFini);
2333 }
2334
2335 return AfterIP;
2336}
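// The per-iteration dispatch reduces to a switch on the workshared loop IV;
// for two sections the loop body looks roughly like (block names
// illustrative except for the hard-coded case labels):
// ```
//   switch i32 %iv, label %section_loop.body.sections.after [
//     i32 0, label %omp_section_loop.body.case
//     i32 1, label %omp_section_loop.body.case1
//   ]
// ```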
2337
2338OpenMPIRBuilder::InsertPointOrErrorTy
2339OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2340 BodyGenCallbackTy BodyGenCB,
2341 FinalizeCallbackTy FiniCB) {
2342 if (!updateToLocation(Loc))
2343 return Loc.IP;
2344
2345 auto FiniCBWrapper = [&](InsertPointTy IP) {
2346 if (IP.getBlock()->end() != IP.getPoint())
2347 return FiniCB(IP);
2348 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2349 // will fail because that function requires the Finalization Basic Block to
2350 // have a terminator, which is already removed by EmitOMPRegionBody.
2351 // IP is currently at the cancellation block.
2352 // We need to backtrack to the condition block to fetch
2353 // the exit block and create a branch from the cancellation
2354 // block to the exit block.
2355 IRBuilder<>::InsertPointGuard IPG(Builder);
2356 Builder.restoreIP(IP);
2357 auto *CaseBB = Loc.IP.getBlock();
2358 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2359 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2360 Instruction *I = Builder.CreateBr(ExitBB);
2361 IP = InsertPointTy(I->getParent(), I->getIterator());
2362 return FiniCB(IP);
2363 };
2364
2365 Directive OMPD = Directive::OMPD_sections;
2366 // Since we are using Finalization Callback here, HasFinalize
2367 // and IsCancellable have to be true
2368 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2369 /*Conditional*/ false, /*hasFinalize*/ true,
2370 /*IsCancellable*/ true);
2371}
2372
2373static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2374 BasicBlock::iterator IT(I);
2375 IT++;
2376 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2377}
2378
2379Value *OpenMPIRBuilder::getGPUThreadID() {
2380 return Builder.CreateCall(
2381 getOrCreateRuntimeFunction(M,
2382 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2383 {});
2384}
2385
2386Value *OpenMPIRBuilder::getGPUWarpSize() {
2387 return Builder.CreateCall(
2388 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2389}
2390
2391Value *OpenMPIRBuilder::getNVPTXWarpID() {
2392 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2393 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2394}
2395
2396Value *OpenMPIRBuilder::getNVPTXLaneID() {
2397 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2398 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2399 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2400 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2401 "nvptx_lane_id");
2402}
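// Worked example, assuming the common warp size of 32: LaneIDBits = 5,
// LaneIDMask = ~0u >> (32 - 5) = 0x1f, so GPU thread id 37 maps to lane
// 37 & 0x1f = 5, while getNVPTXWarpID above yields warp 37 >> 5 = 1.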
2403
2404Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2405 Type *ToType) {
2406 Type *FromType = From->getType();
2407 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2408 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2409 assert(FromSize > 0 && "From size must be greater than zero");
2410 assert(ToSize > 0 && "To size must be greater than zero");
2411 if (FromType == ToType)
2412 return From;
2413 if (FromSize == ToSize)
2414 return Builder.CreateBitCast(From, ToType);
2415 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2416 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2417 InsertPointTy SaveIP = Builder.saveIP();
2418 Builder.restoreIP(AllocaIP);
2419 Value *CastItem = Builder.CreateAlloca(ToType);
2420 Builder.restoreIP(SaveIP);
2421
2422 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2423 CastItem, Builder.getPtrTy(0));
2424 Builder.CreateStore(From, ValCastItem);
2425 return Builder.CreateLoad(ToType, CastItem);
2426}
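// Dispatch sketch: i16 -> i32 takes the signed CreateIntCast path, a
// same-size pair such as float -> i32 is handled by CreateBitCast, and a
// size-changing non-integer pair falls through to the stack reinterpretation
// (alloca at AllocaIP, store, load).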
2427
2428Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2429 Value *Element,
2430 Type *ElementType,
2431 Value *Offset) {
2432 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2433 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2434
2435 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2436 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2437 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2438 Value *WarpSize =
2439 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2440 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2441 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2442 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2443 Value *WarpSizeCast =
2444 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2445 Value *ShuffleCall =
2446 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2447 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2448}
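// For an element of up to four bytes this expands to roughly:
// ```
//   %res = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %ws)
// ```
// with the i64 variant used for five- to eight-byte elements.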
2449
2450void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2451 Value *DstAddr, Type *ElemType,
2452 Value *Offset, Type *ReductionArrayTy) {
2453 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2454 // Create the loop over the big sized data.
2455 // ptr = (void*)Elem;
2456 // ptrEnd = (void*) Elem + 1;
2457 // Step = 8;
2458 // while (ptr + Step < ptrEnd)
2459 // shuffle((int64_t)*ptr);
2460 // Step = 4;
2461 // while (ptr + Step < ptrEnd)
2462 // shuffle((int32_t)*ptr);
2463 // ...
2464 Type *IndexTy = Builder.getIndexTy(
2465 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2466 Value *ElemPtr = DstAddr;
2467 Value *Ptr = SrcAddr;
2468 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2469 if (Size < IntSize)
2470 continue;
2471 Type *IntType = Builder.getIntNTy(IntSize * 8);
2472 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2473 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2474 Value *SrcAddrGEP =
2475 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2476 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2477 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2478
2479 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2480 if ((Size / IntSize) > 1) {
2481 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2482 SrcAddrGEP, Builder.getPtrTy());
2483 BasicBlock *PreCondBB =
2484 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2485 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2486 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2487 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2488 emitBlock(PreCondBB, CurFunc);
2489 PHINode *PhiSrc =
2490 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2491 PhiSrc->addIncoming(Ptr, CurrentBB);
2492 PHINode *PhiDest =
2493 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2494 PhiDest->addIncoming(ElemPtr, CurrentBB);
2495 Ptr = PhiSrc;
2496 ElemPtr = PhiDest;
2497 Value *PtrDiff = Builder.CreatePtrDiff(
2498 Builder.getInt8Ty(), PtrEnd,
2499 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2500 Builder.CreateCondBr(
2501 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2502 ExitBB);
2503 emitBlock(ThenBB, CurFunc);
2504 Value *Res = createRuntimeShuffleFunction(
2505 AllocaIP,
2506 Builder.CreateAlignedLoad(
2507 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2508 IntType, Offset);
2509 Builder.CreateAlignedStore(Res, ElemPtr,
2510 M.getDataLayout().getPrefTypeAlign(ElemType));
2511 Value *LocalPtr =
2512 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2513 Value *LocalElemPtr =
2514 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2515 PhiSrc->addIncoming(LocalPtr, ThenBB);
2516 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2517 emitBranch(PreCondBB);
2518 emitBlock(ExitBB, CurFunc);
2519 } else {
2520 Value *Res = createRuntimeShuffleFunction(
2521 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2522 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2523 Res->getType()->getScalarSizeInBits())
2524 Res = Builder.CreateTrunc(Res, ElemType);
2525 Builder.CreateStore(Res, ElemPtr);
2526 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2527 ElemPtr =
2528 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2529 }
2530 Size = Size % IntSize;
2531 }
2532}
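// Worked example: a 12-byte element is moved as one 8-byte shuffle (Size /
// IntSize == 1, so the single-shot else branch is taken and Size %= 8 leaves
// 4) followed by one 4-byte shuffle, after which Size is 0 and the remaining
// widths are skipped.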
2533
2534void OpenMPIRBuilder::emitReductionListCopy(
2535 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2536 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2537 CopyOptionsTy CopyOptions) {
2538 Type *IndexTy = Builder.getIndexTy(
2539 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2540 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2541
2542 // Iterates, element-by-element, through the source Reduce list and
2543 // makes a copy.
2544 for (auto En : enumerate(ReductionInfos)) {
2545 const ReductionInfo &RI = En.value();
2546 Value *SrcElementAddr = nullptr;
2547 Value *DestElementAddr = nullptr;
2548 Value *DestElementPtrAddr = nullptr;
2549 // Should we shuffle in an element from a remote lane?
2550 bool ShuffleInElement = false;
2551 // Set to true to update the pointer in the dest Reduce list to a
2552 // newly created element.
2553 bool UpdateDestListPtr = false;
2554
2555 // Step 1.1: Get the address for the src element in the Reduce list.
2556 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2557 ReductionArrayTy, SrcBase,
2558 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2559 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2560
2561 // Step 1.2: Create a temporary to store the element in the destination
2562 // Reduce list.
2563 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2564 ReductionArrayTy, DestBase,
2565 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2566 switch (Action) {
2567 case CopyAction::RemoteLaneToThread: {
2568 InsertPointTy CurIP = Builder.saveIP();
2569 Builder.restoreIP(AllocaIP);
2570 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2571 ".omp.reduction.element");
2572 DestAlloca->setAlignment(
2573 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2574 DestElementAddr = DestAlloca;
2575 DestElementAddr =
2576 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2577 DestElementAddr->getName() + ".ascast");
2578 Builder.restoreIP(CurIP);
2579 ShuffleInElement = true;
2580 UpdateDestListPtr = true;
2581 break;
2582 }
2583 case CopyAction::ThreadCopy: {
2584 DestElementAddr =
2585 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2586 break;
2587 }
2588 }
2589
2590 // Now that all active lanes have read the element in the
2591 // Reduce list, shuffle over the value from the remote lane.
2592 if (ShuffleInElement) {
2593 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2594 RemoteLaneOffset, ReductionArrayTy);
2595 } else {
2596 switch (RI.EvaluationKind) {
2597 case EvalKind::Scalar: {
2598 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2599 // Store the source element value to the dest element address.
2600 Builder.CreateStore(Elem, DestElementAddr);
2601 break;
2602 }
2603 case EvalKind::Complex: {
2604 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2605 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2606 Value *SrcReal = Builder.CreateLoad(
2607 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2608 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2609 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2610 Value *SrcImg = Builder.CreateLoad(
2611 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2612
2613 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2614 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2615 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2616 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2617 Builder.CreateStore(SrcReal, DestRealPtr);
2618 Builder.CreateStore(SrcImg, DestImgPtr);
2619 break;
2620 }
2621 case EvalKind::Aggregate: {
2622 Value *SizeVal = Builder.getInt64(
2623 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2624 Builder.CreateMemCpy(
2625 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2626 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2627 SizeVal, false);
2628 break;
2629 }
2630 };
2631 }
2632
2633 // Step 3.1: Modify reference in dest Reduce list as needed.
2634 // Modifying the reference in Reduce list to point to the newly
2635 // created element. The element is live in the current function
2636 // scope and that of functions it invokes (i.e., reduce_function).
2637 // RemoteReduceData[i] = (void*)&RemoteElem
2638 if (UpdateDestListPtr) {
2639 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2640 DestElementAddr, Builder.getPtrTy(),
2641 DestElementAddr->getName() + ".ascast");
2642 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2643 }
2644 }
2645}
2646
2647Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2648 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2649 AttributeList FuncAttrs) {
2650 InsertPointTy SavedIP = Builder.saveIP();
2651 LLVMContext &Ctx = M.getContext();
2652 FunctionType *FuncTy = FunctionType::get(
2653 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2654 /* IsVarArg */ false);
2655 Function *WcFunc =
2657 "_omp_reduction_inter_warp_copy_func", &M);
2658 WcFunc->setAttributes(FuncAttrs);
2659 WcFunc->addParamAttr(0, Attribute::NoUndef);
2660 WcFunc->addParamAttr(1, Attribute::NoUndef);
2661 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2662 Builder.SetInsertPoint(EntryBB);
2663
2664 // ReduceList: thread local Reduce list.
2665 // At the stage of the computation when this function is called, partially
2666 // aggregated values reside in the first lane of every active warp.
2667 Argument *ReduceListArg = WcFunc->getArg(0);
2668 // NumWarps: number of warps active in the parallel region. This could
2669 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2670 Argument *NumWarpsArg = WcFunc->getArg(1);
2671
2672 // This array is used as a medium to transfer, one reduce element at a time,
2673 // the data from the first lane of every warp to lanes in the first warp
2674 // in order to perform the final step of a reduction in a parallel region
2675 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2676 // for reduced latency, as well as to have a distinct copy for concurrently
2677 // executing target regions. The array is declared with weak linkage so
2678 // that a single copy is shared across compilation units.
2679 StringRef TransferMediumName =
2680 "__openmp_nvptx_data_transfer_temporary_storage";
2681 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2682 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2683 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2684 if (!TransferMedium) {
2685 TransferMedium = new GlobalVariable(
2686 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2687 UndefValue::get(ArrayTy), TransferMediumName,
2688 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2689 /*AddressSpace=*/3);
2690 }
2691
2692 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2693 Value *GPUThreadID = getGPUThreadID();
2694 // nvptx_lane_id = nvptx_id % warpsize
2695 Value *LaneID = getNVPTXLaneID();
2696 // nvptx_warp_id = nvptx_id / warpsize
2697 Value *WarpID = getNVPTXWarpID();
2698
2699 InsertPointTy AllocaIP =
2700 InsertPointTy(Builder.GetInsertBlock(),
2701 Builder.GetInsertBlock()->getFirstInsertionPt());
2702 Type *Arg0Type = ReduceListArg->getType();
2703 Type *Arg1Type = NumWarpsArg->getType();
2704 Builder.restoreIP(AllocaIP);
2705 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2706 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2707 AllocaInst *NumWarpsAlloca =
2708 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2709 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2710 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2711 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2712 NumWarpsAlloca, Builder.getPtrTy(0),
2713 NumWarpsAlloca->getName() + ".ascast");
2714 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2715 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2716 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2717 InsertPointTy CodeGenIP =
2718 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2719 Builder.restoreIP(CodeGenIP);
2720
2721 Value *ReduceList =
2722 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2723
2724 for (auto En : enumerate(ReductionInfos)) {
2725 //
2726 // Warp master copies reduce element to transfer medium in __shared__
2727 // memory.
2728 //
2729 const ReductionInfo &RI = En.value();
2730 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2731 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2732 Type *CType = Builder.getIntNTy(TySize * 8);
2733
2734 unsigned NumIters = RealTySize / TySize;
2735 if (NumIters == 0)
2736 continue;
2737 Value *Cnt = nullptr;
2738 Value *CntAddr = nullptr;
2739 BasicBlock *PrecondBB = nullptr;
2740 BasicBlock *ExitBB = nullptr;
2741 if (NumIters > 1) {
2742 CodeGenIP = Builder.saveIP();
2743 Builder.restoreIP(AllocaIP);
2744 CntAddr =
2745 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2746
2747 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2748 CntAddr->getName() + ".ascast");
2749 Builder.restoreIP(CodeGenIP);
2750 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2751 CntAddr,
2752 /*Volatile=*/false);
2753 PrecondBB = BasicBlock::Create(Ctx, "precond");
2754 ExitBB = BasicBlock::Create(Ctx, "exit");
2755 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2756 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2757 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2758 /*Volatile=*/false);
2759 Value *Cmp = Builder.CreateICmpULT(
2760 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2761 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2762 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2763 }
2764
2765 // kmpc_barrier.
2766 InsertPointOrErrorTy BarrierIP1 =
2767 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2768 omp::Directive::OMPD_unknown,
2769 /* ForceSimpleCall */ false,
2770 /* CheckCancelFlag */ true);
2771 if (!BarrierIP1)
2772 return BarrierIP1.takeError();
2773 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2774 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2775 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2776
2777 // if (lane_id == 0)
2778 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2779 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2780 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2781
2782 // Reduce element = LocalReduceList[i]
2783 auto *RedListArrayTy =
2784 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2785 Type *IndexTy = Builder.getIndexTy(
2786 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2787 Value *ElemPtrPtr =
2788 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2789 {ConstantInt::get(IndexTy, 0),
2790 ConstantInt::get(IndexTy, En.index())});
2791 // elemptr = ((CopyType*)(elemptrptr)) + I
2792 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2793 if (NumIters > 1)
2794 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2795
2796 // Get pointer to location in transfer medium.
2797 // MediumPtr = &medium[warp_id]
2798 Value *MediumPtr = Builder.CreateInBoundsGEP(
2799 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2800 // elem = *elemptr
2801 //*MediumPtr = elem
2802 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2803 // Store the source element value to the dest element address.
2804 Builder.CreateStore(Elem, MediumPtr,
2805 /*IsVolatile*/ true);
2806 Builder.CreateBr(MergeBB);
2807
2808 // else
2809 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2810 Builder.CreateBr(MergeBB);
2811
2812 // endif
2813 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2814 InsertPointOrErrorTy BarrierIP2 =
2815 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2816 omp::Directive::OMPD_unknown,
2817 /* ForceSimpleCall */ false,
2818 /* CheckCancelFlag */ true);
2819 if (!BarrierIP2)
2820 return BarrierIP2.takeError();
2821
2822 // Warp 0 copies reduce element from transfer medium
2823 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2824 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2825 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2826
2827 Value *NumWarpsVal =
2828 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2829 // Up to 32 threads in warp 0 are active.
2830 Value *IsActiveThread =
2831 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2832 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2833
2834 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2835
2836 // SrcMediumPtr = &medium[tid]
2837 // SrcMediumVal = *SrcMediumPtr
2838 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2839 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2840 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2841 Value *TargetElemPtrPtr =
2842 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2843 {ConstantInt::get(IndexTy, 0),
2844 ConstantInt::get(IndexTy, En.index())});
2845 Value *TargetElemPtrVal =
2846 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2847 Value *TargetElemPtr = TargetElemPtrVal;
2848 if (NumIters > 1)
2849 TargetElemPtr =
2850 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2851
2852 // *TargetElemPtr = SrcMediumVal;
2853 Value *SrcMediumValue =
2854 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2855 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2856 Builder.CreateBr(W0MergeBB);
2857
2858 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2859 Builder.CreateBr(W0MergeBB);
2860
2861 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2862
2863 if (NumIters > 1) {
2864 Cnt = Builder.CreateNSWAdd(
2865 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2866 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2867
2868 auto *CurFn = Builder.GetInsertBlock()->getParent();
2869 emitBranch(PrecondBB);
2870 emitBlock(ExitBB, CurFn);
2871 }
2872 RealTySize %= TySize;
2873 }
2874 }
2875
2876 Builder.CreateRetVoid();
2877 Builder.restoreIP(SavedIP);
2878
2879 return WcFunc;
2880}
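// The generated helper has this shape (linkage and parameter attributes as
// set above, body elided):
// ```
//   define internal void @_omp_reduction_inter_warp_copy_func(
//       ptr noundef %reduce_list, i32 noundef %num_warps)
// ```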
2881
2882Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2883 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2884 AttributeList FuncAttrs) {
2885 LLVMContext &Ctx = M.getContext();
2886 FunctionType *FuncTy =
2887 FunctionType::get(Builder.getVoidTy(),
2888 {Builder.getPtrTy(), Builder.getInt16Ty(),
2889 Builder.getInt16Ty(), Builder.getInt16Ty()},
2890 /* IsVarArg */ false);
2891 Function *SarFunc =
2893 "_omp_reduction_shuffle_and_reduce_func", &M);
2894 SarFunc->setAttributes(FuncAttrs);
2895 SarFunc->addParamAttr(0, Attribute::NoUndef);
2896 SarFunc->addParamAttr(1, Attribute::NoUndef);
2897 SarFunc->addParamAttr(2, Attribute::NoUndef);
2898 SarFunc->addParamAttr(3, Attribute::NoUndef);
2899 SarFunc->addParamAttr(1, Attribute::SExt);
2900 SarFunc->addParamAttr(2, Attribute::SExt);
2901 SarFunc->addParamAttr(3, Attribute::SExt);
2902 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2903 Builder.SetInsertPoint(EntryBB);
2904
2905 // Thread local Reduce list used to host the values of data to be reduced.
2906 Argument *ReduceListArg = SarFunc->getArg(0);
2907 // Current lane id; could be logical.
2908 Argument *LaneIDArg = SarFunc->getArg(1);
2909 // Offset of the remote source lane relative to the current lane.
2910 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2911 // Algorithm version. This is expected to be known at compile time.
2912 Argument *AlgoVerArg = SarFunc->getArg(3);
2913
2914 Type *ReduceListArgType = ReduceListArg->getType();
2915 Type *LaneIDArgType = LaneIDArg->getType();
2916 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2917 Value *ReduceListAlloca = Builder.CreateAlloca(
2918 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2919 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2920 LaneIDArg->getName() + ".addr");
2921 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2922 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2923 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2924 AlgoVerArg->getName() + ".addr");
2925 ArrayType *RedListArrayTy =
2926 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2927
2928 // Create a local thread-private variable to host the Reduce list
2929 // from a remote lane.
2930 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2931 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2932
2933 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2934 ReduceListAlloca, ReduceListArgType,
2935 ReduceListAlloca->getName() + ".ascast");
2936 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2937 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2938 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2939 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2940 RemoteLaneOffsetAlloca->getName() + ".ascast");
2941 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2942 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2943 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2944 RemoteReductionListAlloca, Builder.getPtrTy(),
2945 RemoteReductionListAlloca->getName() + ".ascast");
2946
2947 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2948 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2949 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2950 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2951
2952 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2953 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2954 Value *RemoteLaneOffset =
2955 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2956 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2957
2958 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2959
2960 // This loop iterates through the list of reduce elements and copies,
2961 // element by element, from a remote lane in the warp to RemoteReduceList,
2962 // hosted on the thread's stack.
2963 emitReductionListCopy(
2964 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2965 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2966
2967 // The actions to be performed on the Remote Reduce list depend
2968 // on the algorithm version.
2969 //
2970 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2971 // LaneId % 2 == 0 && Offset > 0):
2972 // do the reduction value aggregation
2973 //
2974 // The thread local variable Reduce list is mutated in place to host the
2975 // reduced data, which is the aggregated value produced from local and
2976 // remote lanes.
2977 //
2978 // Note that AlgoVer is expected to be a constant integer known at compile
2979 // time.
2980 // When AlgoVer==0, the first conjunct evaluates to true, making
2981 // the entire predicate true at compile time.
2982 // When AlgoVer==1, only the second part of the second conjunct needs to
2983 // be evaluated at runtime; the other conjuncts fold to false
2984 // at compile time.
2985 // When AlgoVer==2, only the second part of the third conjunct needs to
2986 // be evaluated at runtime; the other conjuncts fold to false
2987 // at compile time.
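  // Schematically, the guard built below is (a sketch, not emitted IR):
  //   bool DoReduce = (AlgoVer == 0) ||
  //                   (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
  //                   (AlgoVer == 2 && (LaneId & 1) == 0 &&
  //                    RemoteLaneOffset > 0);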
2988 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2989 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2990 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2991 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2992 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2993 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2994 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2995 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2996 Value *RemoteOffsetComp =
2997 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2998 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2999 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3000 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3001
3002 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3003 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3004 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3005
3006 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3007 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3008 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3009 ReduceList, Builder.getPtrTy());
3010 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3011 RemoteListAddrCast, Builder.getPtrTy());
3012 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3013 ->addFnAttr(Attribute::NoUnwind);
3014 Builder.CreateBr(MergeBB);
3015
3016 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3017 Builder.CreateBr(MergeBB);
3018
3019 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3020
3021 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3022 // Reduce list.
3023 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3024 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3025 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3026
3027 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3028 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3029 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3030 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3031
3032 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3033 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3034 ReductionInfos, RemoteListAddrCast, ReduceList);
3035 Builder.CreateBr(CpyMergeBB);
3036
3037 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3038 Builder.CreateBr(CpyMergeBB);
3039
3040 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3041
3042 Builder.CreateRetVoid();
3043
3044 return SarFunc;
3045}
3046
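// The emitted copy helper behaves roughly like the following pseudo-C
// (a sketch; <n> is the number of reductions, T_i the i-th element type):
//
//   void _omp_reduction_list_to_global_copy_func(void *buffer, int idx,
//                                                void *reduce_list) {
//     for (int i = 0; i < <n>; ++i)
//       buffer[idx].elem_i = *(T_i *)((void **)reduce_list)[i];
//   }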
3047 Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3048 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3049 AttributeList FuncAttrs) {
3050 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3051 LLVMContext &Ctx = M.getContext();
3052 auto *FuncTy = FunctionType::get(
3053 Builder.getVoidTy(),
3054 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3055 /* IsVarArg */ false);
3056 Function *LtGCFunc =
3057 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3058 "_omp_reduction_list_to_global_copy_func", &M);
3059 LtGCFunc->setAttributes(FuncAttrs);
3060 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3061 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3062 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3063
3064 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3065 Builder.SetInsertPoint(EntryBlock);
3066
3067 // Buffer: global reduction buffer.
3068 Argument *BufferArg = LtGCFunc->getArg(0);
3069 // Idx: index of the buffer.
3070 Argument *IdxArg = LtGCFunc->getArg(1);
3071 // ReduceList: thread local Reduce list.
3072 Argument *ReduceListArg = LtGCFunc->getArg(2);
3073
3074 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3075 BufferArg->getName() + ".addr");
3076 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3077 IdxArg->getName() + ".addr");
3078 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3079 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3080 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3081 BufferArgAlloca, Builder.getPtrTy(),
3082 BufferArgAlloca->getName() + ".ascast");
3083 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3084 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3085 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3086 ReduceListArgAlloca, Builder.getPtrTy(),
3087 ReduceListArgAlloca->getName() + ".ascast");
3088
3089 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3090 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3091 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3092
3093 Value *LocalReduceList =
3094 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3095 Value *BufferArgVal =
3096 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3097 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3098 Type *IndexTy = Builder.getIndexTy(
3099 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3100 for (auto En : enumerate(ReductionInfos)) {
3101 const ReductionInfo &RI = En.value();
3102 auto *RedListArrayTy =
3103 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3104 // Reduce element = LocalReduceList[i]
3105 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3106 RedListArrayTy, LocalReduceList,
3107 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3108 // elemptr = ((CopyType*)(elemptrptr)) + I
3109 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3110
3111 // Global = Buffer.VD[Idx];
3112 Value *BufferVD =
3113 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3114 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3115 ReductionsBufferTy, BufferVD, 0, En.index());
3116
3117 switch (RI.EvaluationKind) {
3118 case EvalKind::Scalar: {
3119 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3120 Builder.CreateStore(TargetElement, GlobVal);
3121 break;
3122 }
3123 case EvalKind::Complex: {
3124 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3125 RI.ElementType, ElemPtr, 0, 0, ".realp");
3126 Value *SrcReal = Builder.CreateLoad(
3127 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3128 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3129 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3130 Value *SrcImg = Builder.CreateLoad(
3131 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3132
3133 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3134 RI.ElementType, GlobVal, 0, 0, ".realp");
3135 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3136 RI.ElementType, GlobVal, 0, 1, ".imagp");
3137 Builder.CreateStore(SrcReal, DestRealPtr);
3138 Builder.CreateStore(SrcImg, DestImgPtr);
3139 break;
3140 }
3141 case EvalKind::Aggregate: {
3142 Value *SizeVal =
3143 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3144 Builder.CreateMemCpy(
3145 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3146 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3147 break;
3148 }
3149 }
3150 }
3151
3152 Builder.CreateRetVoid();
3153 Builder.restoreIP(OldIP);
3154 return LtGCFunc;
3155}
3156
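// The emitted reduce helper does not copy element values; it builds a list
// of pointers into the global buffer and delegates to the shared reduction
// function, roughly (a sketch):
//
//   void _omp_reduction_list_to_global_reduce_func(void *buffer, int idx,
//                                                  void *reduce_list) {
//     void *global_list[<n>] = {&buffer[idx].elem_0, ...};
//     reduce_function(global_list, reduce_list);
//   }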
3157 Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3158 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3159 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3160 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3161 LLVMContext &Ctx = M.getContext();
3162 auto *FuncTy = FunctionType::get(
3163 Builder.getVoidTy(),
3164 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3165 /* IsVarArg */ false);
3166 Function *LtGRFunc =
3167 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3168 "_omp_reduction_list_to_global_reduce_func", &M);
3169 LtGRFunc->setAttributes(FuncAttrs);
3170 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3171 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3172 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3173
3174 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3175 Builder.SetInsertPoint(EntryBlock);
3176
3177 // Buffer: global reduction buffer.
3178 Argument *BufferArg = LtGRFunc->getArg(0);
3179 // Idx: index of the buffer.
3180 Argument *IdxArg = LtGRFunc->getArg(1);
3181 // ReduceList: thread local Reduce list.
3182 Argument *ReduceListArg = LtGRFunc->getArg(2);
3183
3184 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3185 BufferArg->getName() + ".addr");
3186 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3187 IdxArg->getName() + ".addr");
3188 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3189 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3190 auto *RedListArrayTy =
3191 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3192
3193 // 1. Build a list of reduction variables.
3194 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3195 Value *LocalReduceList =
3196 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3197
3198 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3199 BufferArgAlloca, Builder.getPtrTy(),
3200 BufferArgAlloca->getName() + ".ascast");
3201 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3202 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3203 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3204 ReduceListArgAlloca, Builder.getPtrTy(),
3205 ReduceListArgAlloca->getName() + ".ascast");
3206 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3207 LocalReduceList, Builder.getPtrTy(),
3208 LocalReduceList->getName() + ".ascast");
3209
3210 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3211 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3212 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3213
3214 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3215 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3216 Type *IndexTy = Builder.getIndexTy(
3217 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3218 for (auto En : enumerate(ReductionInfos)) {
3219 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3220 RedListArrayTy, LocalReduceListAddrCast,
3221 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3222 Value *BufferVD =
3223 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3224 // Global = Buffer.VD[Idx];
3225 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3226 ReductionsBufferTy, BufferVD, 0, En.index());
3227 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3228 }
3229
3230 // Call reduce_function(GlobalReduceList, ReduceList)
3231 Value *ReduceList =
3232 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3233 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3234 ->addFnAttr(Attribute::NoUnwind);
3235 Builder.CreateRetVoid();
3236 Builder.restoreIP(OldIP);
3237 return LtGRFunc;
3238}
3239
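// Mirror image of the list-to-global copy helper: element values flow from
// the global buffer back into the thread-local Reduce list, roughly
// (a sketch):
//
//   void _omp_reduction_global_to_list_copy_func(void *buffer, int idx,
//                                                void *reduce_list) {
//     for (int i = 0; i < <n>; ++i)
//       *(T_i *)((void **)reduce_list)[i] = buffer[idx].elem_i;
//   }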
3240 Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3241 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3242 AttributeList FuncAttrs) {
3243 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3244 LLVMContext &Ctx = M.getContext();
3245 auto *FuncTy = FunctionType::get(
3246 Builder.getVoidTy(),
3247 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3248 /* IsVarArg */ false);
3249 Function *LtGCFunc =
3250 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3251 "_omp_reduction_global_to_list_copy_func", &M);
3252 LtGCFunc->setAttributes(FuncAttrs);
3253 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3254 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3255 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3256
3257 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3258 Builder.SetInsertPoint(EntryBlock);
3259
3260 // Buffer: global reduction buffer.
3261 Argument *BufferArg = LtGCFunc->getArg(0);
3262 // Idx: index of the buffer.
3263 Argument *IdxArg = LtGCFunc->getArg(1);
3264 // ReduceList: thread local Reduce list.
3265 Argument *ReduceListArg = LtGCFunc->getArg(2);
3266
3267 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3268 BufferArg->getName() + ".addr");
3269 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3270 IdxArg->getName() + ".addr");
3271 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3272 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3273 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3274 BufferArgAlloca, Builder.getPtrTy(),
3275 BufferArgAlloca->getName() + ".ascast");
3276 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3277 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3278 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3279 ReduceListArgAlloca, Builder.getPtrTy(),
3280 ReduceListArgAlloca->getName() + ".ascast");
3281 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3282 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3283 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3284
3285 Value *LocalReduceList =
3286 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3287 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3288 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3289 Type *IndexTy = Builder.getIndexTy(
3290 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3291 for (auto En : enumerate(ReductionInfos)) {
3292 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3293 auto *RedListArrayTy =
3294 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3295 // Reduce element = LocalReduceList[i]
3296 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3297 RedListArrayTy, LocalReduceList,
3298 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3299 // elemptr = ((CopyType*)(elemptrptr)) + I
3300 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3301 // Global = Buffer.VD[Idx];
3302 Value *BufferVD =
3303 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3304 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3305 ReductionsBufferTy, BufferVD, 0, En.index());
3306
3307 switch (RI.EvaluationKind) {
3308 case EvalKind::Scalar: {
3309 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3310 Builder.CreateStore(TargetElement, ElemPtr);
3311 break;
3312 }
3313 case EvalKind::Complex: {
3314 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3315 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3316 Value *SrcReal = Builder.CreateLoad(
3317 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3318 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3319 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3320 Value *SrcImg = Builder.CreateLoad(
3321 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3322
3323 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3324 RI.ElementType, ElemPtr, 0, 0, ".realp");
3325 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3326 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3327 Builder.CreateStore(SrcReal, DestRealPtr);
3328 Builder.CreateStore(SrcImg, DestImgPtr);
3329 break;
3330 }
3331 case EvalKind::Aggregate: {
3332 Value *SizeVal =
3333 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3334 Builder.CreateMemCpy(
3335 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3336 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3337 SizeVal, false);
3338 break;
3339 }
3340 }
3341 }
3342
3343 Builder.CreateRetVoid();
3344 Builder.restoreIP(OldIP);
3345 return LtGCFunc;
3346}
3347
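// Mirror image of the list-to-global reduce helper; note the swapped
// argument order of the final call, which accumulates into the thread-local
// list, roughly (a sketch):
//
//   void _omp_reduction_global_to_list_reduce_func(void *buffer, int idx,
//                                                  void *reduce_list) {
//     void *global_list[<n>] = {&buffer[idx].elem_0, ...};
//     reduce_function(reduce_list, global_list);
//   }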
3348 Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3349 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3350 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3351 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3352 LLVMContext &Ctx = M.getContext();
3353 auto *FuncTy = FunctionType::get(
3354 Builder.getVoidTy(),
3355 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3356 /* IsVarArg */ false);
3357 Function *LtGRFunc =
3358 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3359 "_omp_reduction_global_to_list_reduce_func", &M);
3360 LtGRFunc->setAttributes(FuncAttrs);
3361 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3362 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3363 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3364
3365 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3366 Builder.SetInsertPoint(EntryBlock);
3367
3368 // Buffer: global reduction buffer.
3369 Argument *BufferArg = LtGRFunc->getArg(0);
3370 // Idx: index of the buffer.
3371 Argument *IdxArg = LtGRFunc->getArg(1);
3372 // ReduceList: thread local Reduce list.
3373 Argument *ReduceListArg = LtGRFunc->getArg(2);
3374
3375 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3376 BufferArg->getName() + ".addr");
3377 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3378 IdxArg->getName() + ".addr");
3379 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3380 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3381 ArrayType *RedListArrayTy =
3382 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3383
3384 // 1. Build a list of reduction variables.
3385 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3386 Value *LocalReduceList =
3387 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3388
3389 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3390 BufferArgAlloca, Builder.getPtrTy(),
3391 BufferArgAlloca->getName() + ".ascast");
3392 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3393 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3394 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3395 ReduceListArgAlloca, Builder.getPtrTy(),
3396 ReduceListArgAlloca->getName() + ".ascast");
3397 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3398 LocalReduceList, Builder.getPtrTy(),
3399 LocalReduceList->getName() + ".ascast");
3400
3401 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3402 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3403 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3404
3405 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3406 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3407 Type *IndexTy = Builder.getIndexTy(
3408 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3409 for (auto En : enumerate(ReductionInfos)) {
3410 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3411 RedListArrayTy, ReductionList,
3412 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3413 // Global = Buffer.VD[Idx];
3414 Value *BufferVD =
3415 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3416 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3417 ReductionsBufferTy, BufferVD, 0, En.index());
3418 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3419 }
3420
3421 // Call reduce_function(ReduceList, GlobalReduceList)
3422 Value *ReduceList =
3423 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3424 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3425 ->addFnAttr(Attribute::NoUnwind);
3426 Builder.CreateRetVoid();
3427 Builder.restoreIP(OldIP);
3428 return LtGRFunc;
3429}
3430
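// For a reducer named "foo" this yields a platform-specific name along the
// lines of "foo$omp$reduction$reduction_func" (illustrative only; the
// actual separators come from the configured platform conventions).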
3431 std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3432 std::string Suffix =
3433 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3434 return (Name + Suffix).str();
3435}
3436
3437 Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3438 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3439 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3440 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3441 {Builder.getPtrTy(), Builder.getPtrTy()},
3442 /* IsVarArg */ false);
3443 std::string Name = getReductionFuncName(ReducerName);
3444 Function *ReductionFunc =
3445 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3446 ReductionFunc->setAttributes(FuncAttrs);
3447 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3448 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3449 BasicBlock *EntryBB =
3450 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3451 Builder.SetInsertPoint(EntryBB);
3452
3453 // Need to allocate memory here and sort out the pointers before getting
3454 // the LHS/RHS pointers out
3455 Value *LHSArrayPtr = nullptr;
3456 Value *RHSArrayPtr = nullptr;
3457 Argument *Arg0 = ReductionFunc->getArg(0);
3458 Argument *Arg1 = ReductionFunc->getArg(1);
3459 Type *Arg0Type = Arg0->getType();
3460 Type *Arg1Type = Arg1->getType();
3461
3462 Value *LHSAlloca =
3463 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3464 Value *RHSAlloca =
3465 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3466 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3467 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3468 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3469 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3470 Builder.CreateStore(Arg0, LHSAddrCast);
3471 Builder.CreateStore(Arg1, RHSAddrCast);
3472 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3473 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3474
3475 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3476 Type *IndexTy = Builder.getIndexTy(
3477 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3478 SmallVector<Value *> LHSPtrs, RHSPtrs;
3479 for (auto En : enumerate(ReductionInfos)) {
3480 const ReductionInfo &RI = En.value();
3481 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3482 RedArrayTy, RHSArrayPtr,
3483 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3484 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3485 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3486 RHSI8Ptr, RI.PrivateVariable->getType(),
3487 RHSI8Ptr->getName() + ".ascast");
3488
3489 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3490 RedArrayTy, LHSArrayPtr,
3491 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3492 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3493 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3494 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3495
3496 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3497 LHSPtrs.emplace_back(LHSPtr);
3498 RHSPtrs.emplace_back(RHSPtr);
3499 } else {
3500 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3501 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3502 Value *Reduced;
3503 InsertPointOrErrorTy AfterIP =
3504 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3505 if (!AfterIP)
3506 return AfterIP.takeError();
3507 if (!Builder.GetInsertBlock())
3508 return ReductionFunc;
3509 Builder.CreateStore(Reduced, LHSPtr);
3510 }
3511 }
3512
3513 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3514 for (auto En : enumerate(ReductionInfos)) {
3515 unsigned Index = En.index();
3516 const ReductionInfo &RI = En.value();
3517 Value *LHSFixupPtr, *RHSFixupPtr;
3518 Builder.restoreIP(RI.ReductionGenClang(
3519 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3520
3521 // Fix the callback code generated to use the correct Values for the LHS
3522 // and RHS
3523 LHSFixupPtr->replaceUsesWithIf(
3524 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3525 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3526 ReductionFunc;
3527 });
3528 RHSFixupPtr->replaceUsesWithIf(
3529 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3530 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3531 ReductionFunc;
3532 });
3533 }
3534
3535 Builder.CreateRetVoid();
3536 return ReductionFunc;
3537}
3538
3539 static void
3540 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3541 bool IsGPU) {
3542 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3543 (void)RI;
3544 assert(RI.Variable && "expected non-null variable");
3545 assert(RI.PrivateVariable && "expected non-null private variable");
3546 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3547 "expected non-null reduction generator callback");
3548 if (!IsGPU) {
3549 assert(
3550 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3551 "expected variables and their private equivalents to have the same "
3552 "type");
3553 }
3554 assert(RI.Variable->getType()->isPointerTy() &&
3555 "expected variables to be pointers");
3556 }
3557}
3558
3559 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3560 const LocationDescription &Loc, InsertPointTy AllocaIP,
3561 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3562 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3563 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3564 Value *SrcLocInfo) {
3565 if (!updateToLocation(Loc))
3566 return InsertPointTy();
3567 Builder.restoreIP(CodeGenIP);
3568 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3569 LLVMContext &Ctx = M.getContext();
3570
3571 // Source location for the ident struct
3572 if (!SrcLocInfo) {
3573 uint32_t SrcLocStrSize;
3574 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3575 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3576 }
3577
3578 if (ReductionInfos.size() == 0)
3579 return Builder.saveIP();
3580
3581 BasicBlock *ContinuationBlock = nullptr;
3582 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3583 // Copied code from createReductions
3584 BasicBlock *InsertBlock = Loc.IP.getBlock();
3585 ContinuationBlock =
3586 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3587 InsertBlock->getTerminator()->eraseFromParent();
3588 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3589 }
3590
3591 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3592 AttributeList FuncAttrs;
3593 AttrBuilder AttrBldr(Ctx);
3594 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3595 AttrBldr.addAttribute(Attr);
3596 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3597 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3598
3599 CodeGenIP = Builder.saveIP();
3600 Expected<Function *> ReductionResult =
3601 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3602 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3603 if (!ReductionResult)
3604 return ReductionResult.takeError();
3605 Function *ReductionFunc = *ReductionResult;
3606 Builder.restoreIP(CodeGenIP);
3607
3608 // Set the grid value in the config needed for lowering later on
3609 if (GridValue.has_value())
3610 Config.setGridValue(GridValue.value());
3611 else
3612 Config.setGridValue(getGridValue(T, ReductionFunc));
3613
3614 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3615 // RedList, shuffle_reduce_func, interwarp_copy_func);
3616 // or
3617 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3618 Value *Res;
3619
3620 // 1. Build a list of reduction variables.
3621 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3622 auto Size = ReductionInfos.size();
3623 Type *PtrTy = PointerType::getUnqual(Ctx);
3624 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3625 CodeGenIP = Builder.saveIP();
3626 Builder.restoreIP(AllocaIP);
3627 Value *ReductionListAlloca =
3628 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3629 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3630 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3631 Builder.restoreIP(CodeGenIP);
3632 Type *IndexTy = Builder.getIndexTy(
3633 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3634 for (auto En : enumerate(ReductionInfos)) {
3635 const ReductionInfo &RI = En.value();
3636 Value *ElemPtr = Builder.CreateInBoundsGEP(
3637 RedArrayTy, ReductionList,
3638 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3639 Value *CastElem =
3640 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3641 Builder.CreateStore(CastElem, ElemPtr);
3642 }
3643 CodeGenIP = Builder.saveIP();
3644 Function *SarFunc =
3645 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3646 Expected<Function *> CopyResult =
3647 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3648 if (!CopyResult)
3649 return CopyResult.takeError();
3650 Function *WcFunc = *CopyResult;
3651 Builder.restoreIP(CodeGenIP);
3652
3653 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3654
3655 unsigned MaxDataSize = 0;
3656 SmallVector<Type *> ReductionTypeArgs;
3657 for (auto En : enumerate(ReductionInfos)) {
3658 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3659 if (Size > MaxDataSize)
3660 MaxDataSize = Size;
3661 ReductionTypeArgs.emplace_back(En.value().ElementType);
3662 }
3663 Value *ReductionDataSize =
3664 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3665 if (!IsTeamsReduction) {
3666 Value *SarFuncCast =
3667 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3668 Value *WcFuncCast =
3669 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3670 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3671 WcFuncCast};
3672 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3673 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3674 Res = Builder.CreateCall(Pv2Ptr, Args);
3675 } else {
3676 CodeGenIP = Builder.saveIP();
3677 StructType *ReductionsBufferTy = StructType::create(
3678 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3679 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3680 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3681 Function *LtGCFunc = emitListToGlobalCopyFunction(
3682 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3683 Function *LtGRFunc = emitListToGlobalReduceFunction(
3684 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3685 Function *GtLCFunc = emitGlobalToListCopyFunction(
3686 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3687 Function *GtLRFunc = emitGlobalToListReduceFunction(
3688 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3689 Builder.restoreIP(CodeGenIP);
3690
3691 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3692 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3693
3694 Value *Args3[] = {SrcLocInfo,
3695 KernelTeamsReductionPtr,
3696 Builder.getInt32(ReductionBufNum),
3697 ReductionDataSize,
3698 RL,
3699 SarFunc,
3700 WcFunc,
3701 LtGCFunc,
3702 LtGRFunc,
3703 GtLCFunc,
3704 GtLRFunc};
3705
3706 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3707 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3708 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3709 }
3710
3711 // 5. Build if (res == 1)
3712 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3713 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3714 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3715 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3716
3717 // 6. Build then branch: where we have reduced values in the master
3718 // thread in each team.
3719 // __kmpc_end_reduce{_nowait}(<gtid>);
3720 // break;
3721 emitBlock(ThenBB, CurFunc);
3722
3723 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3724 for (auto En : enumerate(ReductionInfos)) {
3725 const ReductionInfo &RI = En.value();
3726 Value *LHS = RI.Variable;
3727 Value *RHS =
3728 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3729
3730 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3731 Value *LHSPtr, *RHSPtr;
3732 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3733 &LHSPtr, &RHSPtr, CurFunc));
3734
3735 // Fix the callback code generated to use the correct Values for the LHS
3736 // and RHS
3737 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3738 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3739 ReductionFunc;
3740 });
3741 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3742 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3743 ReductionFunc;
3744 });
3745 } else {
3746 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3747 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3748 Value *Reduced;
3749 InsertPointOrErrorTy AfterIP =
3750 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3751 if (!AfterIP)
3752 return AfterIP.takeError();
3753 Builder.CreateStore(Reduced, LHS, false);
3754 }
3755 }
3756 emitBlock(ExitBB, CurFunc);
3757 if (ContinuationBlock) {
3758 Builder.CreateBr(ContinuationBlock);
3759 Builder.SetInsertPoint(ContinuationBlock);
3760 }
3761 Config.setEmitLLVMUsed();
3762
3763 return Builder.saveIP();
3764}
3765
3766 static Function *getFreshReductionFunc(Module &M) {
3767 Type *VoidTy = Type::getVoidTy(M.getContext());
3768 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3769 auto *FuncTy =
3770 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3771 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3772 ".omp.reduction.func", &M);
3773}
3774
3775 static Error populateReductionFunction(
3776 Function *ReductionFunc,
3777 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3778 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3779 Module *Module = ReductionFunc->getParent();
3780 BasicBlock *ReductionFuncBlock =
3781 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3782 Builder.SetInsertPoint(ReductionFuncBlock);
3783 Value *LHSArrayPtr = nullptr;
3784 Value *RHSArrayPtr = nullptr;
3785 if (IsGPU) {
3786 // Need to allocate memory here and sort out the pointers before getting
3787 // the LHS/RHS pointers out
3788 //
3789 Argument *Arg0 = ReductionFunc->getArg(0);
3790 Argument *Arg1 = ReductionFunc->getArg(1);
3791 Type *Arg0Type = Arg0->getType();
3792 Type *Arg1Type = Arg1->getType();
3793
3794 Value *LHSAlloca =
3795 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3796 Value *RHSAlloca =
3797 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3798 Value *LHSAddrCast =
3799 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3800 Value *RHSAddrCast =
3801 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3802 Builder.CreateStore(Arg0, LHSAddrCast);
3803 Builder.CreateStore(Arg1, RHSAddrCast);
3804 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3805 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3806 } else {
3807 LHSArrayPtr = ReductionFunc->getArg(0);
3808 RHSArrayPtr = ReductionFunc->getArg(1);
3809 }
3810
3811 unsigned NumReductions = ReductionInfos.size();
3812 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3813
3814 for (auto En : enumerate(ReductionInfos)) {
3815 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3816 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3817 RedArrayTy, LHSArrayPtr, 0, En.index());
3818 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3819 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3820 LHSI8Ptr, RI.Variable->getType());
3821 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3822 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3823 RedArrayTy, RHSArrayPtr, 0, En.index());
3824 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3825 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3826 RHSI8Ptr, RI.PrivateVariable->getType());
3827 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3828 Value *Reduced;
3829 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3830 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3831 if (!AfterIP)
3832 return AfterIP.takeError();
3833
3834 Builder.restoreIP(*AfterIP);
3835 // TODO: Consider flagging an error.
3836 if (!Builder.GetInsertBlock())
3837 return Error::success();
3838
3839 // The store is inside the reduction region when using by-ref.
3840 if (!IsByRef[En.index()])
3841 Builder.CreateStore(Reduced, LHSPtr);
3842 }
3843 Builder.CreateRetVoid();
3844 return Error::success();
3845}
3846
3847 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3848 const LocationDescription &Loc, InsertPointTy AllocaIP,
3849 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3850 bool IsNoWait, bool IsTeamsReduction) {
3851 assert(ReductionInfos.size() == IsByRef.size());
3852 if (Config.isGPU())
3853 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3854 IsNoWait, IsTeamsReduction);
3855
3856 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3857
3858 if (!updateToLocation(Loc))
3859 return InsertPointTy();
3860
3861 if (ReductionInfos.size() == 0)
3862 return Builder.saveIP();
3863
3864 BasicBlock *InsertBlock = Loc.IP.getBlock();
3865 BasicBlock *ContinuationBlock =
3866 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3867 InsertBlock->getTerminator()->eraseFromParent();
3868
3869 // Create and populate array of type-erased pointers to private reduction
3870 // values.
3871 unsigned NumReductions = ReductionInfos.size();
3872 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3873 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3874 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3875
3876 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3877
3878 for (auto En : enumerate(ReductionInfos)) {
3879 unsigned Index = En.index();
3880 const ReductionInfo &RI = En.value();
3881 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3882 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3883 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3884 }
3885
3886 // Emit a call to the runtime function that orchestrates the reduction.
3887 // Declare the reduction function in the process.
3888 Type *IndexTy = Builder.getIndexTy(
3889 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3890 Function *Func = Builder.GetInsertBlock()->getParent();
3891 Module *Module = Func->getParent();
3892 uint32_t SrcLocStrSize;
3893 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3894 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3895 return RI.AtomicReductionGen;
3896 });
3897 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3898 CanGenerateAtomic
3899 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3900 : IdentFlag(0));
3901 Value *ThreadId = getOrCreateThreadID(Ident);
3902 Constant *NumVariables = Builder.getInt32(NumReductions);
3903 const DataLayout &DL = Module->getDataLayout();
3904 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3905 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3906 Function *ReductionFunc = getFreshReductionFunc(*Module);
3907 Value *Lock = getOMPCriticalRegionLock(".reduction");
3908 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3909 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3910 : RuntimeFunction::OMPRTL___kmpc_reduce);
3911 CallInst *ReduceCall =
3912 Builder.CreateCall(ReduceFunc,
3913 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3914 ReductionFunc, Lock},
3915 "reduce");
3916
3917 // Create final reduction entry blocks for the atomic and non-atomic cases.
3918 // Emit IR that dispatches control flow to one of the blocks based on
3919 // whether the reduction supports the atomic mode.
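  // Schematically, the dispatch emitted below is (a sketch):
  //   switch (__kmpc_reduce{_nowait}(...)) {
  //   case 1: <non-atomic reduction>; __kmpc_end_reduce{_nowait}(...); break;
  //   case 2: <atomic reduction>; break;
  //   default: break; // fall through to reduce.finalize
  //   }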
3920 BasicBlock *NonAtomicRedBlock =
3921 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3922 BasicBlock *AtomicRedBlock =
3923 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3924 SwitchInst *Switch =
3925 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3926 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3927 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3928
3929 // Populate the non-atomic reduction using the elementwise reduction function.
3930 // This loads the elements from the global and private variables and reduces
3931 // them before storing back the result to the global variable.
3932 Builder.SetInsertPoint(NonAtomicRedBlock);
3933 for (auto En : enumerate(ReductionInfos)) {
3934 const ReductionInfo &RI = En.value();
3935 Type *ValueType = RI.ElementType;
3936 // We have one less load for the by-ref case because that load is now
3937 // inside the reduction region.
3938 Value *RedValue = RI.Variable;
3939 if (!IsByRef[En.index()]) {
3940 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3941 "red.value." + Twine(En.index()));
3942 }
3943 Value *PrivateRedValue =
3944 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3945 "red.private.value." + Twine(En.index()));
3946 Value *Reduced;
3947 InsertPointOrErrorTy AfterIP =
3948 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3949 if (!AfterIP)
3950 return AfterIP.takeError();
3951 Builder.restoreIP(*AfterIP);
3952
3953 if (!Builder.GetInsertBlock())
3954 return InsertPointTy();
3955 // For the by-ref case, the load is inside the reduction region.
3956 if (!IsByRef[En.index()])
3957 Builder.CreateStore(Reduced, RI.Variable);
3958 }
3959 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3960 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3961 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3962 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3963 Builder.CreateBr(ContinuationBlock);
3964
3965 // Populate the atomic reduction using the atomic elementwise reduction
3966 // function. There are no loads/stores here because they happen
3967 // inside the atomic elementwise reduction.
3968 Builder.SetInsertPoint(AtomicRedBlock);
3969 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3970 for (const ReductionInfo &RI : ReductionInfos) {
3971 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3972 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3973 if (!AfterIP)
3974 return AfterIP.takeError();
3975 Builder.restoreIP(*AfterIP);
3976 if (!Builder.GetInsertBlock())
3977 return InsertPointTy();
3978 }
3979 Builder.CreateBr(ContinuationBlock);
3980 } else {
3981 Builder.CreateUnreachable();
3982 }
3983
3984 // Populate the outlined reduction function using the elementwise reduction
3985 // function. Partial values are extracted from the type-erased array of
3986 // pointers to private variables.
3987 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3988 IsByRef, /*isGPU=*/false);
3989 if (Err)
3990 return Err;
3991
3992 if (!Builder.GetInsertBlock())
3993 return InsertPointTy();
3994
3995 Builder.SetInsertPoint(ContinuationBlock);
3996 return Builder.saveIP();
3997}
3998
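// Emits an inlined region guarded by the master check, schematically
// (a sketch):
//   if (__kmpc_master(&loc, tid)) {
//     <body generated by BodyGenCB>
//     __kmpc_end_master(&loc, tid);
//   }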
3999 OpenMPIRBuilder::InsertPointOrErrorTy
4000 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4001 BodyGenCallbackTy BodyGenCB,
4002 FinalizeCallbackTy FiniCB) {
4003 if (!updateToLocation(Loc))
4004 return Loc.IP;
4005
4006 Directive OMPD = Directive::OMPD_master;
4007 uint32_t SrcLocStrSize;
4008 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4009 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4010 Value *ThreadId = getOrCreateThreadID(Ident);
4011 Value *Args[] = {Ident, ThreadId};
4012
4013 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4014 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4015
4016 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4017 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4018
4019 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4020 /*Conditional*/ true, /*hasFinalize*/ true);
4021}
4022
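// Same shape as createMaster, but the guard additionally takes the filter
// thread id, schematically (a sketch):
//   if (__kmpc_masked(&loc, tid, filter)) {
//     <body generated by BodyGenCB>
//     __kmpc_end_masked(&loc, tid);
//   }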
4023 OpenMPIRBuilder::InsertPointOrErrorTy
4024 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4025 BodyGenCallbackTy BodyGenCB,
4026 FinalizeCallbackTy FiniCB, Value *Filter) {
4027 if (!updateToLocation(Loc))
4028 return Loc.IP;
4029
4030 Directive OMPD = Directive::OMPD_masked;
4031 uint32_t SrcLocStrSize;
4032 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4033 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4034 Value *ThreadId = getOrCreateThreadID(Ident);
4035 Value *Args[] = {Ident, ThreadId, Filter};
4036 Value *ArgsEnd[] = {Ident, ThreadId};
4037
4038 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4039 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4040
4041 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4042 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4043
4044 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4045 /*Conditional*/ true, /*hasFinalize*/ true);
4046}
4047
4048 static llvm::CallInst *emitNoUnwindRuntimeCall(llvm::IRBuilderBase &Builder,
4049 llvm::FunctionCallee Callee,
4050 llvm::ArrayRef<llvm::Value *> Args,
4051 const llvm::Twine &Name) {
4052 llvm::CallInst *Call = Builder.CreateCall(
4053 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4054 Call->setDoesNotThrow();
4055 return Call;
4056}
4057
4058 // Expects the input basic block to be dominated by BeforeScanBB.
4059 // Once the scan directive is encountered, the code after it should be
4060 // dominated by AfterScanBB. The scan directive splits the code sequence
4061 // into an input phase and a scan phase. Based on whether the inclusive or
4062 // exclusive clause is used in the scan directive, and on whether the input
4063 // loop or the scan loop is being lowered, it adds jumps to the input and
4064 // scan phases. The first scan loop is the input loop and the second is the
4065 // scan loop. The generated code currently handles only inclusive scans.
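// For example, for an inclusive scan over "x" the two lowered loops behave
// roughly like (a sketch):
//   input loop: for (i) { x += a[i]; buffer[i] = x; } // store after use
//   scan loop:  for (i) { x = buffer[i]; use(x); }    // load before use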
4066 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4067 const LocationDescription &Loc, InsertPointTy AllocaIP,
4068 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4069 bool IsInclusive, ScanInfo *ScanRedInfo) {
4070 if (ScanRedInfo->OMPFirstScanLoop) {
4071 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4072 ScanVarsType, ScanRedInfo);
4073 if (Err)
4074 return Err;
4075 }
4076 if (!updateToLocation(Loc))
4077 return Loc.IP;
4078
4079 llvm::Value *IV = ScanRedInfo->IV;
4080
4081 if (ScanRedInfo->OMPFirstScanLoop) {
4082 // Emit buffer[i] = red; at the end of the input phase.
4083 for (size_t i = 0; i < ScanVars.size(); i++) {
4084 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4085 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4086 Type *DestTy = ScanVarsType[i];
4087 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4088 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4089
4090 Builder.CreateStore(Src, Val);
4091 }
4092 }
4093 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4094 emitBlock(ScanRedInfo->OMPScanDispatch,
4095 Builder.GetInsertBlock()->getParent());
4096
4097 if (!ScanRedInfo->OMPFirstScanLoop) {
4098 IV = ScanRedInfo->IV;
4099 // Emit red = buffer[i]; at the entrance to the scan phase.
4100 // TODO: for an exclusive scan, this needs to be red = buffer[i-1].
4101 for (size_t i = 0; i < ScanVars.size(); i++) {
4102 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4103 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4104 Type *DestTy = ScanVarsType[i];
4105 Value *SrcPtr =
4106 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4107 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4108 Builder.CreateStore(Src, ScanVars[i]);
4109 }
4110 }
4111
4112 // TODO: Update it to CreateBr and remove dead blocks
4113 llvm::Value *CmpI = Builder.getInt1(true);
4114 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4115 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4116 ScanRedInfo->OMPAfterScanBlock);
4117 } else {
4118 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4119 ScanRedInfo->OMPBeforeScanBlock);
4120 }
4121 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4122 Builder.GetInsertBlock()->getParent());
4123 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4124 return Builder.saveIP();
4125}
4126
4127 Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4128 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4129 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4130
4131 Builder.restoreIP(AllocaIP);
4132 // Create the shared pointer at alloca IP.
4133 for (size_t i = 0; i < ScanVars.size(); i++) {
4134 llvm::Value *BuffPtr =
4135 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4136 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4137 }
4138
4139 // Allocate the temporary buffer on the master thread.
4140 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4141 InsertPointTy CodeGenIP) -> Error {
4142 Builder.restoreIP(CodeGenIP);
4143 Value *AllocSpan =
4144 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4145 for (size_t i = 0; i < ScanVars.size(); i++) {
4146 Type *IntPtrTy = Builder.getInt32Ty();
4147 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4148 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4149 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4150 AllocSpan, nullptr, "arr");
4151 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4152 }
4153 return Error::success();
4154 };
4155 // TODO: Perform finalization actions for variables. This has to be
4156 // called for variables which have destructors/finalizers.
4157 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4158
4159 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4160 llvm::Value *FilterVal = Builder.getInt32(0);
4161 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4162 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4163
4164 if (!AfterIP)
4165 return AfterIP.takeError();
4166 Builder.restoreIP(*AfterIP);
4167 BasicBlock *InputBB = Builder.GetInsertBlock();
4168 if (InputBB->getTerminator())
4169 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4170 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4171 if (!AfterIP)
4172 return AfterIP.takeError();
4173 Builder.restoreIP(*AfterIP);
4174
4175 return Error::success();
4176}
4177
4178 Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4179 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4180 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4181 InsertPointTy CodeGenIP) -> Error {
4182 Builder.restoreIP(CodeGenIP);
4183 for (ReductionInfo RedInfo : ReductionInfos) {
4184 Value *PrivateVar = RedInfo.PrivateVariable;
4185 Value *OrigVar = RedInfo.Variable;
4186 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4187 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4188
4189 Type *SrcTy = RedInfo.ElementType;
4190 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4191 "arrayOffset");
4192 Value *Src = Builder.CreateLoad(SrcTy, Val);
4193
4194 Builder.CreateStore(Src, OrigVar);
4195 Builder.CreateFree(Buff);
4196 }
4197 return Error::success();
4198 };
4199 // TODO: Perform finalization actions for variables. This has to be
4200 // called for variables which have destructors/finalizers.
4201 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4202
4203 if (ScanRedInfo->OMPScanFinish->getTerminator())
4204 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4205 else
4206 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4207
4208 llvm::Value *FilterVal = Builder.getInt32(0);
4209 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4210 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4211
4212 if (!AfterIP)
4213 return AfterIP.takeError();
4214 Builder.restoreIP(*AfterIP);
4215 BasicBlock *InputBB = Builder.GetInsertBlock();
4216 if (InputBB->getTerminator())
4217 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4218 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4219 if (!AfterIP)
4220 return AfterIP.takeError();
4221 Builder.restoreIP(*AfterIP);
4222 return Error::success();
4223}
4224
4225 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4226 const LocationDescription &Loc,
4227 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4228 ScanInfo *ScanRedInfo) {
4229
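  // The masked region below performs an in-place inclusive prefix scan over
  // the temporary buffer in ceil(log2(n)) passes:
  //   for (k = 0; 2^k < n; ++k)
  //     for (i = n - 1; i >= 2^k; --i)
  //       buffer[i] op= buffer[i - 2^k];
  // For example, with op == + and buffer == [a, b, c, d]:
  //   after k==0: [a, a+b, b+c, c+d]
  //   after k==1: [a, a+b, a+b+c, a+b+c+d]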
4230 if (!updateToLocation(Loc))
4231 return Loc.IP;
4232 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4233 InsertPointTy CodeGenIP) -> Error {
4234 Builder.restoreIP(CodeGenIP);
4235 Function *CurFn = Builder.GetInsertBlock()->getParent();
4236 // for (int k = 0; k <= ceil(log2(n)); ++k)
4237 llvm::BasicBlock *LoopBB =
4238 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4239 llvm::BasicBlock *ExitBB =
4240 splitBB(Builder, false, "omp.outer.log.scan.exit");
4241 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4242 Builder.GetInsertBlock()->getModule(),
4243 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4244 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4245 llvm::Value *Arg =
4246 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4247 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4248 F = llvm::Intrinsic::getOrInsertDeclaration(
4249 Builder.GetInsertBlock()->getModule(),
4250 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4251 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4252 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4253 llvm::Value *NMin1 = Builder.CreateNUWSub(
4254 ScanRedInfo->Span,
4255 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4256 Builder.SetInsertPoint(InputBB);
4257 Builder.CreateBr(LoopBB);
4258 emitBlock(LoopBB, CurFn);
4259 Builder.SetInsertPoint(LoopBB);
4260
4261 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4262 // size pow2k = 1;
4263 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4264 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4265 InputBB);
4266 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4267 InputBB);
4268 // for (size i = n - 1; i >= 2 ^ k; --i)
4269 // tmp[i] op= tmp[i-pow2k];
4270 llvm::BasicBlock *InnerLoopBB =
4271 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4272 llvm::BasicBlock *InnerExitBB =
4273 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4274 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4275 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4276 emitBlock(InnerLoopBB, CurFn);
4277 Builder.SetInsertPoint(InnerLoopBB);
4278 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4279 IVal->addIncoming(NMin1, LoopBB);
4280 for (ReductionInfo RedInfo : ReductionInfos) {
4281 Value *ReductionVal = RedInfo.PrivateVariable;
4282 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4283 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4284 Type *DestTy = RedInfo.ElementType;
4285 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4286 Value *LHSPtr =
4287 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4288 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4289 Value *RHSPtr =
4290 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4291 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4292 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4293 Value *Result;
4294 InsertPointOrErrorTy AfterIP =
4295 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4296 if (!AfterIP)
4297 return AfterIP.takeError();
4298 Builder.CreateStore(Result, LHSPtr);
4299 }
4300 llvm::Value *NextIVal = Builder.CreateNUWSub(
4301 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4302 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4303 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4304 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4305 emitBlock(InnerExitBB, CurFn);
4306 llvm::Value *Next = Builder.CreateNUWAdd(
4307 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4308 Counter->addIncoming(Next, Builder.GetInsertBlock());
4309 // pow2k <<= 1;
4310 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4311 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4312 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4313 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4314 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4315 return Error::success();
4316 };
4317
4318 // TODO: Perform finalization actions for variables. This has to be
4319 // called for variables which have destructors/finalizers.
4320 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4321
4322 llvm::Value *FilterVal = Builder.getInt32(0);
4323 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4324 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4325
4326 if (!AfterIP)
4327 return AfterIP.takeError();
4328 Builder.restoreIP(*AfterIP);
4329 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4330
4331 if (!AfterIP)
4332 return AfterIP.takeError();
4333 Builder.restoreIP(*AfterIP);
4334 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4335 if (Err)
4336 return Err;
4337
4338 return AfterIP;
4339}
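// Illustrative sketch (not part of the file's own code): the masked region
// generated above performs the strided in-place up-sweep over the shared scan
// buffer, equivalent to the following serial pseudo-C, where `n` is the trip
// count (Span), `tmp` a per-reduction buffer, and `op` the reduction operator:
//
//   for (size_t k = 0; k != (size_t)ceil(log2((double)n)); ++k) {
//     size_t pow2k = (size_t)1 << k;
//     for (size_t i = n - 1; i >= pow2k; --i)
//       tmp[i] = tmp[i] op tmp[i - pow2k];
//   }
//
// Each outer iteration doubles the stride, so O(log n) passes suffice to turn
// tmp[0..n-1] into an inclusive prefix scan.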
4340
4341Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4342 llvm::function_ref<Error()> InputLoopGen,
4343 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4344 ScanInfo *ScanRedInfo) {
4345
4346 {
4347 // Emit loop with input phase:
4348 // for (i: 0..<num_iters>) {
4349 // <input phase>;
4350 // buffer[i] = red;
4351 // }
4352 ScanRedInfo->OMPFirstScanLoop = true;
4353 Error Err = InputLoopGen();
4354 if (Err)
4355 return Err;
4356 }
4357 {
4358 // Emit loop with scan phase:
4359 // for (i: 0..<num_iters>) {
4360 // red = buffer[i];
4361 // <scan phase>;
4362 // }
4363 ScanRedInfo->OMPFirstScanLoop = false;
4364 Error Err = ScanLoopGen(Builder.saveIP());
4365 if (Err)
4366 return Err;
4367 }
4368 return Error::success();
4369}
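// Illustrative sketch (not part of the file's own code): for a worksharing
// loop with an inclusive scan reduction, the two generator callbacks above
// lower the directive roughly as follows:
//
//   for (i = 0; i < num_iters; ++i) {   // OMPFirstScanLoop = true
//     red = <input phase>(i);
//     buffer[i] = red;                  // store into the scan buffer
//   }
//   // emitScanReduction(): parallel prefix combine of buffer[0..n-1]
//   for (i = 0; i < num_iters; ++i) {   // OMPFirstScanLoop = false
//     red = buffer[i];                  // read back the prefix value
//     <scan phase>(i);
//   }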
4370
4371void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4372 Function *Fun = Builder.GetInsertBlock()->getParent();
4373 ScanRedInfo->OMPScanDispatch =
4374 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4375 ScanRedInfo->OMPAfterScanBlock =
4376 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4377 ScanRedInfo->OMPBeforeScanBlock =
4378 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4379 ScanRedInfo->OMPScanLoopExit =
4380 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4381}
4382CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4383 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4384 BasicBlock *PostInsertBefore, const Twine &Name) {
4385 Module *M = F->getParent();
4386 LLVMContext &Ctx = M->getContext();
4387 Type *IndVarTy = TripCount->getType();
4388
4389 // Create the basic block structure.
4390 BasicBlock *Preheader =
4391 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4392 BasicBlock *Header =
4393 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4394 BasicBlock *Cond =
4395 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4396 BasicBlock *Body =
4397 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4398 BasicBlock *Latch =
4399 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4400 BasicBlock *Exit =
4401 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4402 BasicBlock *After =
4403 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4404
4405 // Use specified DebugLoc for new instructions.
4406 Builder.SetCurrentDebugLocation(DL);
4407
4408 Builder.SetInsertPoint(Preheader);
4409 Builder.CreateBr(Header);
4410
4411 Builder.SetInsertPoint(Header);
4412 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4413 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4414 Builder.CreateBr(Cond);
4415
4416 Builder.SetInsertPoint(Cond);
4417 Value *Cmp =
4418 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4419 Builder.CreateCondBr(Cmp, Body, Exit);
4420
4421 Builder.SetInsertPoint(Body);
4422 Builder.CreateBr(Latch);
4423
4424 Builder.SetInsertPoint(Latch);
4425 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4426 "omp_" + Name + ".next", /*HasNUW=*/true);
4427 Builder.CreateBr(Header);
4428 IndVarPHI->addIncoming(Next, Latch);
4429
4430 Builder.SetInsertPoint(Exit);
4431 Builder.CreateBr(After);
4432
4433 // Remember and return the canonical control flow.
4434 LoopInfos.emplace_front();
4435 CanonicalLoopInfo *CL = &LoopInfos.front();
4436
4437 CL->Header = Header;
4438 CL->Cond = Cond;
4439 CL->Latch = Latch;
4440 CL->Exit = Exit;
4441
4442#ifndef NDEBUG
4443 CL->assertOK();
4444#endif
4445 return CL;
4446}
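// For orientation, the skeleton created above has the following shape (a
// sketch using the block names from this function):
//
//   preheader -> header -> cond -> body -> latch -> header (backedge)
//                            \
//                             `-> exit -> after   (once iv >= tripcount)
//
// The induction variable is a PHI in the header that starts at 0 and is
// incremented by 1 in the latch, so the body executes exactly TripCount
// times with iv = 0, 1, ..., TripCount-1.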
4447
4448Expected<CanonicalLoopInfo *>
4449OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4450 LoopBodyGenCallbackTy BodyGenCB,
4451 Value *TripCount, const Twine &Name) {
4452 BasicBlock *BB = Loc.IP.getBlock();
4453 BasicBlock *NextBB = BB->getNextNode();
4454
4455 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4456 NextBB, NextBB, Name);
4457 BasicBlock *After = CL->getAfter();
4458
4459 // If location is not set, don't connect the loop.
4460 if (updateToLocation(Loc)) {
4461 // Split the loop at the insertion point: Branch to the preheader and move
4462 // every following instruction to after the loop (the After BB). Also, the
4463 // new successor is the loop's after block.
4464 spliceBB(Builder, After, /*CreateBranch=*/false);
4465 Builder.CreateBr(CL->getPreheader());
4466 }
4467
4468 // Emit the body content. We do it after connecting the loop to the CFG to
4469 // avoid that the callback encounters degenerate BBs.
4470 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4471 return Err;
4472
4473#ifndef NDEBUG
4474 CL->assertOK();
4475#endif
4476 return CL;
4477}
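// A minimal usage sketch (assuming an initialized OpenMPIRBuilder `OMPBuilder`
// and a valid LocationDescription `Loc`; the body emitted by the callback is
// illustrative only):
//
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
//                        Value *IV) -> Error {
//     OMPBuilder.Builder.restoreIP(CodeGenIP);
//     // ... emit the loop body here, using IV in [0, TripCount) ...
//     return Error::success();
//   };
//   Expected<CanonicalLoopInfo *> CLI =
//       OMPBuilder.createCanonicalLoop(Loc, BodyGenCB, TripCount);
//   if (!CLI)
//     return CLI.takeError();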
4478
4479Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4480 ScanInfos.emplace_front();
4481 ScanInfo *Result = &ScanInfos.front();
4482 return Result;
4483}
4484
4485Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4486OpenMPIRBuilder::createCanonicalScanLoops(
4487 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4488 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4489 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4490 LocationDescription ComputeLoc =
4491 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4492 updateToLocation(ComputeLoc);
4493
4495
4494 SmallVector<llvm::CanonicalLoopInfo *> Result;
4496 Value *TripCount = calculateCanonicalLoopTripCount(
4497 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4498 ScanRedInfo->Span = TripCount;
4499 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4500 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4501
4502 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4503 Builder.restoreIP(CodeGenIP);
4504 ScanRedInfo->IV = IV;
4505 createScanBBs(ScanRedInfo);
4506 BasicBlock *InputBlock = Builder.GetInsertBlock();
4507 Instruction *Terminator = InputBlock->getTerminator();
4508 assert(Terminator->getNumSuccessors() == 1);
4509 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4510 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4511 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4512 Builder.GetInsertBlock()->getParent());
4513 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4514 emitBlock(ScanRedInfo->OMPScanLoopExit,
4515 Builder.GetInsertBlock()->getParent());
4516 Builder.CreateBr(ContinueBlock);
4517 Builder.SetInsertPoint(
4518 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4519 return BodyGenCB(Builder.saveIP(), IV);
4520 };
4521
4522 const auto &&InputLoopGen = [&]() -> Error {
4523 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4524 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4525 ComputeIP, Name, true, ScanRedInfo);
4526 if (!LoopInfo)
4527 return LoopInfo.takeError();
4528 Result.push_back(*LoopInfo);
4529 Builder.restoreIP((*LoopInfo)->getAfterIP());
4530 return Error::success();
4531 };
4532 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4533 Expected<CanonicalLoopInfo *> LoopInfo =
4534 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4535 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4536 if (!LoopInfo)
4537 return LoopInfo.takeError();
4538 Result.push_back(*LoopInfo);
4539 Builder.restoreIP((*LoopInfo)->getAfterIP());
4540 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4541 return Error::success();
4542 };
4543 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4544 if (Err)
4545 return Err;
4546 return Result;
4547}
4548
4549Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4550 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4551 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4552
4553 // Consider the following difficulties (assuming 8-bit signed integers):
4554 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4555 // DO I = 1, 100, 50
4556 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4557 // DO I = 100, 0, -128
4558
4559 // Start, Stop and Step must be of the same integer type.
4560 auto *IndVarTy = cast<IntegerType>(Start->getType());
4561 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4562 assert(IndVarTy == Step->getType() && "Step type mismatch");
4563
4564 updateToLocation(Loc);
4565
4566 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4567 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4568
4569 // Like Step, but always positive.
4570 Value *Incr = Step;
4571
4572 // Distance between Start and Stop; always positive.
4573 Value *Span;
4574
4575 // Condition for whether no iterations are executed at all, e.g. because
4576 // UB < LB.
4577 Value *ZeroCmp;
4578
4579 if (IsSigned) {
4580 // Ensure that increment is positive. If not, negate and invert LB and UB.
4581 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4582 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4583 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4584 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4585 Span = Builder.CreateSub(UB, LB, "", false, true);
4586 ZeroCmp = Builder.CreateICmp(
4587 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4588 } else {
4589 Span = Builder.CreateSub(Stop, Start, "", true);
4590 ZeroCmp = Builder.CreateICmp(
4591 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4592 }
4593
4594 Value *CountIfLooping;
4595 if (InclusiveStop) {
4596 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4597 } else {
4598 // Avoid incrementing past stop since it could overflow.
4599 Value *CountIfTwo = Builder.CreateAdd(
4600 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4601 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4602 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4603 }
4604
4605 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4606 "omp_" + Name + ".tripcount");
4607}
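// Worked example of the computation above (a sketch; values chosen to match
// the Fortran-style comments at the top of this function): for
// DO I = 100, 0, -3 (Start=100, Stop=0, Step=-3, IsSigned, InclusiveStop),
// Step is negative, so Incr=3, LB=0, UB=100, and Span = 100. With
// InclusiveStop, CountIfLooping = 100/3 + 1 = 34, matching the 34 values
// I = 100, 97, ..., 1. ZeroCmp is false (UB >= LB), so 34 is returned.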
4608
4609Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4610 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4611 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4612 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4613 ScanInfo *ScanRedInfo) {
4614 LocationDescription ComputeLoc =
4615 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4616
4617 Value *TripCount = calculateCanonicalLoopTripCount(
4618 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4619
4620 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4621 Builder.restoreIP(CodeGenIP);
4622 Value *Span = Builder.CreateMul(IV, Step);
4623 Value *IndVar = Builder.CreateAdd(Span, Start);
4624 if (InScan)
4625 ScanRedInfo->IV = IndVar;
4626 return BodyGenCB(Builder.saveIP(), IndVar);
4627 };
4628 LocationDescription LoopLoc =
4629 ComputeIP.isSet()
4630 ? Loc
4631 : LocationDescription(Builder.saveIP(),
4632 Builder.getCurrentDebugLocation());
4633 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4634}
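// Sketch of the normalization performed above: the emitted canonical loop
// counts IV = 0, 1, ..., TripCount-1, and the user-visible induction
// variable is recomputed in the body as IndVar = Start + IV * Step. For
// example, Start=10, Stop=0, Step=-2 (exclusive stop) gives TripCount = 5
// and visits IndVar = 10, 8, 6, 4, 2.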
4635
4636// Returns an LLVM function to call for initializing loop bounds using OpenMP
4637// static scheduling for composite `distribute parallel for` depending on
4638// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4639// integers as unsigned similarly to CanonicalLoopInfo.
4640static FunctionCallee
4641getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4642 OpenMPIRBuilder &OMPBuilder) {
4643 unsigned Bitwidth = Ty->getIntegerBitWidth();
4644 if (Bitwidth == 32)
4645 return OMPBuilder.getOrCreateRuntimeFunction(
4646 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4647 if (Bitwidth == 64)
4648 return OMPBuilder.getOrCreateRuntimeFunction(
4649 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4650 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4651}
4652
4653// Returns an LLVM function to call for initializing loop bounds using OpenMP
4654// static scheduling depending on `type`. Only i32 and i64 are supported by the
4655// runtime. Always interpret integers as unsigned similarly to
4656// CanonicalLoopInfo.
4657static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4658 OpenMPIRBuilder &OMPBuilder) {
4659 unsigned Bitwidth = Ty->getIntegerBitWidth();
4660 if (Bitwidth == 32)
4661 return OMPBuilder.getOrCreateRuntimeFunction(
4662 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4663 if (Bitwidth == 64)
4664 return OMPBuilder.getOrCreateRuntimeFunction(
4665 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4666 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4667}
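// For reference, the 32-bit entry point selected above has roughly the
// following C prototype in the LLVM OpenMP runtime (a sketch from kmp.h;
// the runtime headers are authoritative):
//
//   void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
//                                  kmp_int32 schedtype, kmp_int32 *plastiter,
//                                  kmp_uint32 *plower, kmp_uint32 *pupper,
//                                  kmp_int32 *pstride, kmp_int32 incr,
//                                  kmp_int32 chunk);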
4668
4669OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4670 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4671 WorksharingLoopType LoopType, bool NeedsBarrier) {
4672 assert(CLI->isValid() && "Requires a valid canonical loop");
4673 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4674 "Require dedicated allocate IP");
4675
4676 // Set up the source location value for OpenMP runtime.
4677 Builder.restoreIP(CLI->getPreheaderIP());
4678 Builder.SetCurrentDebugLocation(DL);
4679
4680 uint32_t SrcLocStrSize;
4681 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4682 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4683
4684 // Declare useful OpenMP runtime functions.
4685 Value *IV = CLI->getIndVar();
4686 Type *IVTy = IV->getType();
4687 FunctionCallee StaticInit =
4688 LoopType == WorksharingLoopType::DistributeForStaticLoop
4689 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4690 : getKmpcForStaticInitForType(IVTy, M, *this);
4691 FunctionCallee StaticFini =
4692 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4693
4694 // Allocate space for computed loop bounds as expected by the "init" function.
4695 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4696
4697 Type *I32Type = Type::getInt32Ty(M.getContext());
4698 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4699 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4700 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4701 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4702 CLI->setLastIter(PLastIter);
4703
4704 // At the end of the preheader, prepare for calling the "init" function by
4705 // storing the current loop bounds into the allocated space. A canonical loop
4706 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4707 // and produces an inclusive upper bound.
4708 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4709 Constant *Zero = ConstantInt::get(IVTy, 0);
4710 Constant *One = ConstantInt::get(IVTy, 1);
4711 Builder.CreateStore(Zero, PLowerBound);
4712 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4713 Builder.CreateStore(UpperBound, PUpperBound);
4714 Builder.CreateStore(One, PStride);
4715
4716 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4717
4718 OMPScheduleType SchedType =
4719 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4720 ? OMPScheduleType::OrderedDistribute
4721 : OMPScheduleType::UnorderedStatic;
4722 Constant *SchedulingType =
4723 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4724
4725 // Call the "init" function and update the trip count of the loop with the
4726 // value it produced.
4727 SmallVector<Value *, 10> Args(
4728 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4729 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4730 Value *PDistUpperBound =
4731 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4732 Args.push_back(PDistUpperBound);
4733 }
4734 Args.append({PStride, One, Zero});
4735 Builder.CreateCall(StaticInit, Args);
4736 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4737 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4738 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4739 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4740 CLI->setTripCount(TripCount);
4741
4742 // Update all uses of the induction variable except the one in the condition
4743 // block that compares it with the actual upper bound, and the increment in
4744 // the latch block.
4745
4746 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4747 Builder.SetInsertPoint(CLI->getBody(),
4748 CLI->getBody()->getFirstInsertionPt());
4749 Builder.SetCurrentDebugLocation(DL);
4750 return Builder.CreateAdd(OldIV, LowerBound);
4751 });
4752
4753 // In the "exit" block, call the "fini" function.
4754 Builder.SetInsertPoint(CLI->getExit(),
4755 CLI->getExit()->getTerminator()->getIterator());
4756 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4757
4758 // Add the barrier if requested.
4759 if (NeedsBarrier) {
4760 InsertPointOrErrorTy BarrierIP =
4761 createBarrier(LocationDescription(Builder.saveIP(), DL),
4762 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4763 /* CheckCancelFlag */ false);
4764 if (!BarrierIP)
4765 return BarrierIP.takeError();
4766 }
4767
4768 InsertPointTy AfterIP = CLI->getAfterIP();
4769 CLI->invalidate();
4770
4771 return AfterIP;
4772}
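// Net effect of the transformation above (sketch): each thread ends up
// executing
//
//   __kmpc_for_static_init_*(loc, tid, sched, &last, &lb, &ub, &stride,
//                            /*incr=*/1, /*chunk=*/0);
//   for (iv = 0; iv < ub - lb + 1; ++iv)
//     body(lb + iv);                 // IV uses remapped via mapIndVar
//   __kmpc_for_static_fini(loc, tid);
//
// i.e. the runtime narrows [lb, ub] to the thread's share of the iteration
// space, and the canonical loop then iterates only over that share.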
4773
4774OpenMPIRBuilder::InsertPointOrErrorTy
4775OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4776 CanonicalLoopInfo *CLI,
4777 InsertPointTy AllocaIP,
4778 bool NeedsBarrier,
4779 Value *ChunkSize) {
4780 assert(CLI->isValid() && "Requires a valid canonical loop");
4781 assert(ChunkSize && "Chunk size is required");
4782
4783 LLVMContext &Ctx = CLI->getFunction()->getContext();
4784 Value *IV = CLI->getIndVar();
4785 Value *OrigTripCount = CLI->getTripCount();
4786 Type *IVTy = IV->getType();
4787 assert(IVTy->getIntegerBitWidth() <= 64 &&
4788 "Max supported tripcount bitwidth is 64 bits");
4789 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4790 : Type::getInt64Ty(Ctx);
4791 Type *I32Type = Type::getInt32Ty(M.getContext());
4792 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4793 Constant *One = ConstantInt::get(InternalIVTy, 1);
4794
4795 // Declare useful OpenMP runtime functions.
4796 FunctionCallee StaticInit =
4797 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4798 FunctionCallee StaticFini =
4799 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4800
4801 // Allocate space for computed loop bounds as expected by the "init" function.
4802 Builder.restoreIP(AllocaIP);
4803 Builder.SetCurrentDebugLocation(DL);
4804 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4805 Value *PLowerBound =
4806 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4807 Value *PUpperBound =
4808 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4809 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4810 CLI->setLastIter(PLastIter);
4811
4812 // Set up the source location value for the OpenMP runtime.
4813 Builder.restoreIP(CLI->getPreheaderIP());
4814 Builder.SetCurrentDebugLocation(DL);
4815
4816 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4817 Value *CastedChunkSize =
4818 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4819 Value *CastedTripCount =
4820 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4821
4822 Constant *SchedulingType = ConstantInt::get(
4823 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4824 Builder.CreateStore(Zero, PLowerBound);
4825 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4826 Builder.CreateStore(OrigUpperBound, PUpperBound);
4827 Builder.CreateStore(One, PStride);
4828
4829 // Call the "init" function and update the trip count of the loop with the
4830 // value it produced.
4831 uint32_t SrcLocStrSize;
4832 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4833 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4834 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4835 Builder.CreateCall(StaticInit,
4836 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4837 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4838 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4839 /*pstride=*/PStride, /*incr=*/One,
4840 /*chunk=*/CastedChunkSize});
4841
4842 // Load values written by the "init" function.
4843 Value *FirstChunkStart =
4844 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4845 Value *FirstChunkStop =
4846 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4847 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4848 Value *ChunkRange =
4849 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4850 Value *NextChunkStride =
4851 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4852
4853 // Create outer "dispatch" loop for enumerating the chunks.
4854 BasicBlock *DispatchEnter = splitBB(Builder, true);
4855 Value *DispatchCounter;
4856
4857 // It is safe to assume this didn't return an error because the callback
4858 // passed into createCanonicalLoop is the only possible error source, and it
4859 // always returns success.
4860 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4861 {Builder.saveIP(), DL},
4862 [&](InsertPointTy BodyIP, Value *Counter) {
4863 DispatchCounter = Counter;
4864 return Error::success();
4865 },
4866 FirstChunkStart, CastedTripCount, NextChunkStride,
4867 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4868 "dispatch"));
4869
4870 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4871 // not have to preserve the canonical invariant.
4872 BasicBlock *DispatchBody = DispatchCLI->getBody();
4873 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4874 BasicBlock *DispatchExit = DispatchCLI->getExit();
4875 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4876 DispatchCLI->invalidate();
4877
4878 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4879 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4880 redirectTo(CLI->getExit(), DispatchLatch, DL);
4881 redirectTo(DispatchBody, DispatchEnter, DL);
4882
4883 // Prepare the prolog of the chunk loop.
4884 Builder.restoreIP(CLI->getPreheaderIP());
4885 Builder.SetCurrentDebugLocation(DL);
4886
4887 // Compute the number of iterations of the chunk loop.
4888 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4889 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4890 Value *IsLastChunk =
4891 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4892 Value *CountUntilOrigTripCount =
4893 Builder.CreateSub(CastedTripCount, DispatchCounter);
4894 Value *ChunkTripCount = Builder.CreateSelect(
4895 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4896 Value *BackcastedChunkTC =
4897 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4898 CLI->setTripCount(BackcastedChunkTC);
4899
4900 // Update all uses of the induction variable except the one in the condition
4901 // block that compares it with the actual upper bound, and the increment in
4902 // the latch block.
4903 Value *BackcastedDispatchCounter =
4904 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4905 CLI->mapIndVar([&](Instruction *) -> Value * {
4906 Builder.restoreIP(CLI->getBodyIP());
4907 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4908 });
4909
4910 // In the "exit" block, call the "fini" function.
4911 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4912 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4913
4914 // Add the barrier if requested.
4915 if (NeedsBarrier) {
4916 InsertPointOrErrorTy AfterIP =
4917 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4918 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4919 if (!AfterIP)
4920 return AfterIP.takeError();
4921 }
4922
4923#ifndef NDEBUG
4924 // Even though we currently do not support applying additional methods to it,
4925 // the chunk loop should remain a canonical loop.
4926 CLI->assertOK();
4927#endif
4928
4929 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4930}
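// Net effect of the transformation above (sketch): the original loop becomes
// the chunk loop nested inside a freshly created dispatch loop, roughly
//
//   for (d = firstchunk.lb; d < tripcount; d += dispatch.stride) {
//     chunk_tc = is_last_chunk ? tripcount - d : chunk.range;
//     for (iv = 0; iv < chunk_tc; ++iv)
//       body(d + iv);
//   }
//
// with firstchunk.lb, chunk.range and dispatch.stride taken from a single
// __kmpc_for_static_init_* call using a static-chunked schedule.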
4931
4932// Returns an LLVM function to call for executing an OpenMP static worksharing
4933// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4934// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4935static FunctionCallee
4936getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4937 WorksharingLoopType LoopType) {
4938 unsigned Bitwidth = Ty->getIntegerBitWidth();
4939 Module &M = OMPBuilder->M;
4940 switch (LoopType) {
4941 case WorksharingLoopType::ForStaticLoop:
4942 if (Bitwidth == 32)
4943 return OMPBuilder->getOrCreateRuntimeFunction(
4944 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4945 if (Bitwidth == 64)
4946 return OMPBuilder->getOrCreateRuntimeFunction(
4947 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4948 break;
4949 case WorksharingLoopType::DistributeStaticLoop:
4950 if (Bitwidth == 32)
4951 return OMPBuilder->getOrCreateRuntimeFunction(
4952 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4953 if (Bitwidth == 64)
4954 return OMPBuilder->getOrCreateRuntimeFunction(
4955 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4956 break;
4957 case WorksharingLoopType::DistributeForStaticLoop:
4958 if (Bitwidth == 32)
4959 return OMPBuilder->getOrCreateRuntimeFunction(
4960 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4961 if (Bitwidth == 64)
4962 return OMPBuilder->getOrCreateRuntimeFunction(
4963 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4964 break;
4965 }
4966 if (Bitwidth != 32 && Bitwidth != 64) {
4967 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4968 }
4969 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4970}
4971
4972// Inserts a call to the proper OpenMP Device RTL function which handles
4973// loop worksharing.
4974static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4975 WorksharingLoopType LoopType,
4976 BasicBlock *InsertBlock, Value *Ident,
4977 Value *LoopBodyArg, Value *TripCount,
4978 Function &LoopBodyFn) {
4979 Type *TripCountTy = TripCount->getType();
4980 Module &M = OMPBuilder->M;
4981 IRBuilder<> &Builder = OMPBuilder->Builder;
4982 FunctionCallee RTLFn =
4983 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4984 SmallVector<Value *, 8> RealArgs;
4985 RealArgs.push_back(Ident);
4986 RealArgs.push_back(&LoopBodyFn);
4987 RealArgs.push_back(LoopBodyArg);
4988 RealArgs.push_back(TripCount);
4989 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4990 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4991 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4992 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4993 Builder.CreateCall(RTLFn, RealArgs);
4994 return;
4995 }
4996 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4997 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4998 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4999 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5000
5001 RealArgs.push_back(
5002 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5003 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5004 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5005 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5006 }
5007 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5008
5009 Builder.CreateCall(RTLFn, RealArgs);
5010}
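// Sketch of the call emitted for WorksharingLoopType::ForStaticLoop with a
// 32-bit trip count (arguments shown in the order assembled in RealArgs
// above; the names are descriptive guesses, not the DeviceRTL's own):
//
//   %nt = call i32 @omp_get_num_threads()
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @loop_body_fn,
//                                        ptr %loop_body_arg, i32 %tripcount,
//                                        i32 %nt, i32 0, i8 0)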
5011
5012static void workshareLoopTargetCallback(
5013 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5014 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5015 WorksharingLoopType LoopType) {
5016 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5017 BasicBlock *Preheader = CLI->getPreheader();
5018 Value *TripCount = CLI->getTripCount();
5019
5020 // After loop body outlining, the loop body contains only the setup of
5021 // the loop body argument structure and the call to the outlined loop
5022 // body function. First, we need to move the setup of the loop body args
5023 // into the loop preheader.
5024 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5025 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5026
5027 // The next step is to remove the whole loop. We do not need it anymore.
5028 // That's why we make an unconditional branch from the loop preheader to
5029 // the loop exit block.
5030 Builder.restoreIP({Preheader, Preheader->end()});
5031 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5032 Preheader->getTerminator()->eraseFromParent();
5033 Builder.CreateBr(CLI->getExit());
5034
5035 // Delete dead loop blocks
5036 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5037 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5038 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5039 CleanUpInfo.EntryBB = CLI->getHeader();
5040 CleanUpInfo.ExitBB = CLI->getExit();
5041 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5042 DeleteDeadBlocks(BlocksToBeRemoved);
5043
5044 // Find the instruction which corresponds to loop body argument structure
5045 // and remove the call to loop body function instruction.
5046 Value *LoopBodyArg;
5047 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5048 assert(OutlinedFnUser &&
5049 "Expected unique undroppable user of outlined function");
5050 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5051 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5052 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5053 "Expected outlined function call to be located in loop preheader");
5054 // Check in case no argument structure has been passed.
5055 if (OutlinedFnCallInstruction->arg_size() > 1)
5056 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5057 else
5058 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5059 OutlinedFnCallInstruction->eraseFromParent();
5060
5061 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5062 LoopBodyArg, TripCount, OutlinedFn);
5063
5064 for (auto &ToBeDeletedItem : ToBeDeleted)
5065 ToBeDeletedItem->eraseFromParent();
5066 CLI->invalidate();
5067}
5068
5069OpenMPIRBuilder::InsertPointTy
5070OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
5071 InsertPointTy AllocaIP,
5072 WorksharingLoopType LoopType) {
5073 uint32_t SrcLocStrSize;
5074 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5075 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5076
5077 OutlineInfo OI;
5078 OI.OuterAllocaBB = CLI->getPreheader();
5079 Function *OuterFn = CLI->getPreheader()->getParent();
5080
5081 // Instructions which need to be deleted at the end of code generation
5082 SmallVector<Instruction *, 4> ToBeDeleted;
5083
5084 OI.OuterAllocaBB = AllocaIP.getBlock();
5085
5086 // Mark the body loop as region which needs to be extracted
5087 OI.EntryBB = CLI->getBody();
5088 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5089 "omp.prelatch", true);
5090
5091 // Prepare loop body for extraction
5092 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5093
5094 // Insert new loop counter variable which will be used only in loop
5095 // body.
5096 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5097 Instruction *NewLoopCntLoad =
5098 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5099 // New loop counter instructions are redundant in the loop preheader when
5100 // code generation for the workshare loop is finished. That's why we mark
5101 // them as ready for deletion.
5102 ToBeDeleted.push_back(NewLoopCntLoad);
5103 ToBeDeleted.push_back(NewLoopCnt);
5104
5105 // Analyse loop body region. Find all input variables which are used inside
5106 // loop body region.
5107 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5108 SmallVector<BasicBlock *, 32> Blocks;
5109 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5110
5111 CodeExtractorAnalysisCache CEAC(*OuterFn);
5112 CodeExtractor Extractor(Blocks,
5113 /* DominatorTree */ nullptr,
5114 /* AggregateArgs */ true,
5115 /* BlockFrequencyInfo */ nullptr,
5116 /* BranchProbabilityInfo */ nullptr,
5117 /* AssumptionCache */ nullptr,
5118 /* AllowVarArgs */ true,
5119 /* AllowAlloca */ true,
5120 /* AllocationBlock */ CLI->getPreheader(),
5121 /* Suffix */ ".omp_wsloop",
5122 /* AggrArgsIn0AddrSpace */ true);
5123
5124 BasicBlock *CommonExit = nullptr;
5125 SetVector<Value *> SinkingCands, HoistingCands;
5126
5127 // Find allocas outside the loop body region which are used inside loop
5128 // body
5129 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5130
5131 // We need to model the loop body region as the function f(cnt, loop_arg).
5132 // That's why we replace the loop induction variable with the new counter,
5133 // which will become one of the loop body function's arguments.
5134 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5135 CLI->getIndVar()->user_end());
5136 for (auto Use : Users) {
5137 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5138 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5139 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5140 }
5141 }
5142 }
5143 // Make sure that the loop counter variable is not merged into the loop
5144 // body function argument structure and is instead passed as a separate variable.
5145 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5146
5147 // The PostOutline CB is invoked when the loop body function is outlined
5148 // and the loop body is replaced by a call to the outlined function. We
5149 // need to add a call to the OpenMP device RTL inside the loop preheader.
5150 // The OpenMP device RTL function will handle the loop control logic.
5151 //
5152 OI.PostOutlineCB = [=, ToBeDeletedVec =
5153 std::move(ToBeDeleted)](Function &OutlinedFn) {
5154 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5155 LoopType);
5156 };
5157 addOutlineInfo(std::move(OI));
5158 return CLI->getAfterIP();
5159}
5160
5161OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5162 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5163 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5164 bool HasSimdModifier, bool HasMonotonicModifier,
5165 bool HasNonmonotonicModifier, bool HasOrderedClause,
5166 WorksharingLoopType LoopType) {
5167 if (Config.isTargetDevice())
5168 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
5169 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5170 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5171 HasNonmonotonicModifier, HasOrderedClause);
5172
5173 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5174 OMPScheduleType::ModifierOrdered;
5175 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5176 case OMPScheduleType::BaseStatic:
5177 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5178 if (IsOrdered)
5179 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5180 NeedsBarrier, ChunkSize);
5181 // FIXME: Monotonicity ignored?
5182 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5183
5184 case OMPScheduleType::BaseStaticChunked:
5185 if (IsOrdered)
5186 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5187 NeedsBarrier, ChunkSize);
5188 // FIXME: Monotonicity ignored?
5189 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5190 ChunkSize);
5191
5192 case OMPScheduleType::BaseRuntime:
5193 case OMPScheduleType::BaseAuto:
5194 case OMPScheduleType::BaseGreedy:
5195 case OMPScheduleType::BaseBalanced:
5196 case OMPScheduleType::BaseSteal:
5197 case OMPScheduleType::BaseGuidedSimd:
5198 case OMPScheduleType::BaseRuntimeSimd:
5199 assert(!ChunkSize &&
5200 "schedule type does not support user-defined chunk sizes");
5201 [[fallthrough]];
5202 case OMPScheduleType::BaseDynamicChunked:
5203 case OMPScheduleType::BaseGuidedChunked:
5204 case OMPScheduleType::BaseGuidedIterativeChunked:
5205 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5206 case OMPScheduleType::BaseStaticBalancedChunked:
5207 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5208 NeedsBarrier, ChunkSize);
5209
5210 default:
5211 llvm_unreachable("Unknown/unimplemented schedule kind");
5212 }
5213}
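// Examples of the dispatch implemented by the switch above (sketch):
//
//   schedule(static)           -> applyStaticWorkshareLoop
//   schedule(static, 4)        -> applyStaticChunkedWorkshareLoop
//   schedule(dynamic, 4)       -> applyDynamicWorkshareLoop
//   schedule(guided)/(runtime)/(auto)
//                              -> applyDynamicWorkshareLoop
//   ordered schedule(static)   -> applyDynamicWorkshareLoop (ordered needs
//                                 the dispatch init/next/fini protocol)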
5214
5215/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5216/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5217/// the runtime. Always interpret integers as unsigned similarly to
5218/// CanonicalLoopInfo.
5219static FunctionCallee
5220getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5221 unsigned Bitwidth = Ty->getIntegerBitWidth();
5222 if (Bitwidth == 32)
5223 return OMPBuilder.getOrCreateRuntimeFunction(
5224 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5225 if (Bitwidth == 64)
5226 return OMPBuilder.getOrCreateRuntimeFunction(
5227 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5228 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5229}
5230
5231/// Returns an LLVM function to call for updating the next loop using OpenMP
5232/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5233/// the runtime. Always interpret integers as unsigned similarly to
5234/// CanonicalLoopInfo.
5235static FunctionCallee
5236getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5237 unsigned Bitwidth = Ty->getIntegerBitWidth();
5238 if (Bitwidth == 32)
5239 return OMPBuilder.getOrCreateRuntimeFunction(
5240 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5241 if (Bitwidth == 64)
5242 return OMPBuilder.getOrCreateRuntimeFunction(
5243 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5244 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5245}
5246
5247/// Returns an LLVM function to call for finalizing the dynamic loop,
5248/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5249/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5250static FunctionCallee
5251getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5252 unsigned Bitwidth = Ty->getIntegerBitWidth();
5253 if (Bitwidth == 32)
5254 return OMPBuilder.getOrCreateRuntimeFunction(
5255 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5256 if (Bitwidth == 64)
5257 return OMPBuilder.getOrCreateRuntimeFunction(
5258 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5259 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5260}
5261
5262OpenMPIRBuilder::InsertPointOrErrorTy
5263OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5264 InsertPointTy AllocaIP,
5265 OMPScheduleType SchedType,
5266 bool NeedsBarrier, Value *Chunk) {
5267 assert(CLI->isValid() && "Requires a valid canonical loop");
5268 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5269 "Require dedicated allocate IP");
5271 "Require valid schedule type");
5272
5273 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5274 OMPScheduleType::ModifierOrdered;
5275
5276 // Set up the source location value for OpenMP runtime.
5277 Builder.SetCurrentDebugLocation(DL);
5278
5279 uint32_t SrcLocStrSize;
5280 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5281 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5282
5283 // Declare useful OpenMP runtime functions.
5284 Value *IV = CLI->getIndVar();
5285 Type *IVTy = IV->getType();
5286 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5287 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5288
5289 // Allocate space for computed loop bounds as expected by the "init" function.
5290 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5291 Type *I32Type = Type::getInt32Ty(M.getContext());
5292 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5293 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5294 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5295 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5296 CLI->setLastIter(PLastIter);
5297
5298 // At the end of the preheader, prepare for calling the "init" function by
5299 // storing the current loop bounds into the allocated space. A canonical loop
5300 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5301 // and produces an inclusive upper bound.
5302 BasicBlock *PreHeader = CLI->getPreheader();
5303 Builder.SetInsertPoint(PreHeader->getTerminator());
5304 Constant *One = ConstantInt::get(IVTy, 1);
5305 Builder.CreateStore(One, PLowerBound);
5306 Value *UpperBound = CLI->getTripCount();
5307 Builder.CreateStore(UpperBound, PUpperBound);
5308 Builder.CreateStore(One, PStride);
5309
5310 BasicBlock *Header = CLI->getHeader();
5311 BasicBlock *Exit = CLI->getExit();
5312 BasicBlock *Cond = CLI->getCond();
5313 BasicBlock *Latch = CLI->getLatch();
5314 InsertPointTy AfterIP = CLI->getAfterIP();
5315
5316 // The CLI will be "broken" in the code below, as the loop is no longer
5317 // a valid canonical loop.
5318
5319 if (!Chunk)
5320 Chunk = One;
5321
5322 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5323
5324 Constant *SchedulingType =
5325 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5326
5327 // Call the "init" function.
5328 Builder.CreateCall(DynamicInit,
5329 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5330 UpperBound, /* step */ One, Chunk});
5331
5332 // An outer loop around the existing one.
5333 BasicBlock *OuterCond = BasicBlock::Create(
5334 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5335 PreHeader->getParent());
5336 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5337 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5338 Value *Res =
5339 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5340 PLowerBound, PUpperBound, PStride});
5341 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5342 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5343 Value *LowerBound =
5344 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5345 Builder.CreateCondBr(MoreWork, Header, Exit);
5346
5347 // Change PHI-node in loop header to use outer cond rather than preheader,
5348 // and set IV to the LowerBound.
5349 Instruction *Phi = &Header->front();
5350 auto *PI = cast<PHINode>(Phi);
5351 PI->setIncomingBlock(0, OuterCond);
5352 PI->setIncomingValue(0, LowerBound);
5353
5354 // Then set the pre-header to jump to the OuterCond
5355 Instruction *Term = PreHeader->getTerminator();
5356 auto *Br = cast<BranchInst>(Term);
5357 Br->setSuccessor(0, OuterCond);
5358
5359 // Modify the inner condition:
5360 // * Use the UpperBound returned from the DynamicNext call.
5361 // * jump to the loop outer loop when done with one of the inner loops.
5362 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5363 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5364 Instruction *Comp = &*Builder.GetInsertPoint();
5365 auto *CI = cast<CmpInst>(Comp);
5366 CI->setOperand(1, UpperBound);
5367 // Redirect the inner exit to branch to outer condition.
5368 Instruction *Branch = &Cond->back();
5369 auto *BI = cast<BranchInst>(Branch);
5370 assert(BI->getSuccessor(1) == Exit);
5371 BI->setSuccessor(1, OuterCond);
5372
5373 // Call the "fini" function if "ordered" is present in wsloop directive.
5374 if (Ordered) {
5375 Builder.SetInsertPoint(&Latch->back());
5376 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5377 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5378 }
5379
5380 // Add the barrier if requested.
5381 if (NeedsBarrier) {
5382 Builder.SetInsertPoint(&Exit->back());
5383 InsertPointOrErrorTy BarrierIP =
5384 createBarrier(LocationDescription(Builder.saveIP(), DL),
5385 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5386 /* CheckCancelFlag */ false);
5387 if (!BarrierIP)
5388 return BarrierIP.takeError();
5389 }
5390
5391 CLI->invalidate();
5392 return AfterIP;
5393}
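// Net effect of the rewrite above (sketch): the canonical loop is wrapped in
// an outer loop driven by the dynamic-dispatch runtime protocol, roughly
//
//   __kmpc_dispatch_init_*(loc, tid, sched, /*lb=*/1, /*ub=*/tripcount,
//                          /*st=*/1, chunk);
//   while (__kmpc_dispatch_next_*(loc, tid, &last, &lb, &ub, &stride)) {
//     for (iv = lb - 1; iv < ub; ++iv)  // reuses the old loop blocks;
//       body(iv);                       // ub is inclusive and 1-based here
//   }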
5394
5395/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5396/// after this \p OldTarget will be orphaned.
5397static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5398 BasicBlock *NewTarget, DebugLoc DL) {
5399 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5400 redirectTo(Pred, NewTarget, DL);
5401}
5402
5403/// Determine which blocks in \p BBs are reachable from outside and remove the
5404/// ones that are not reachable from the function.
5405static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5406 SmallPtrSet<BasicBlock *, 6> BBsToErase(BBs.begin(), BBs.end());
5407 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5408 for (Use &U : BB->uses()) {
5409 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5410 if (!UseInst)
5411 continue;
5412 if (BBsToErase.count(UseInst->getParent()))
5413 continue;
5414 return true;
5415 }
5416 return false;
5417 };
5418
5419 while (BBsToErase.remove_if(HasRemainingUses)) {
5420 // Try again if anything was removed.
5421 }
5422
5423 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5424 DeleteDeadBlocks(BBVec);
5425}
5426
5427CanonicalLoopInfo *
5428OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5429 InsertPointTy ComputeIP) {
5430 assert(Loops.size() >= 1 && "At least one loop required");
5431 size_t NumLoops = Loops.size();
5432
5433 // Nothing to do if there is already just one loop.
5434 if (NumLoops == 1)
5435 return Loops.front();
5436
5437 CanonicalLoopInfo *Outermost = Loops.front();
5438 CanonicalLoopInfo *Innermost = Loops.back();
5439 BasicBlock *OrigPreheader = Outermost->getPreheader();
5440 BasicBlock *OrigAfter = Outermost->getAfter();
5441 Function *F = OrigPreheader->getParent();
5442
5443 // Loop control blocks that may become orphaned later.
5444 SmallVector<BasicBlock *, 12> OldControlBBs;
5445 OldControlBBs.reserve(6 * Loops.size());
5446 for (CanonicalLoopInfo *Loop : Loops)
5447 Loop->collectControlBlocks(OldControlBBs);
5448
5449 // Setup the IRBuilder for inserting the trip count computation.
5450 Builder.SetCurrentDebugLocation(DL);
5451 if (ComputeIP.isSet())
5452 Builder.restoreIP(ComputeIP);
5453 else
5454 Builder.restoreIP(Outermost->getPreheaderIP());
5455
5456 // Derive the collapsed loop's trip count.
5457 // TODO: Find common/largest indvar type.
5458 Value *CollapsedTripCount = nullptr;
5459 for (CanonicalLoopInfo *L : Loops) {
5460 assert(L->isValid() &&
5461 "All loops to collapse must be valid canonical loops");
5462 Value *OrigTripCount = L->getTripCount();
5463 if (!CollapsedTripCount) {
5464 CollapsedTripCount = OrigTripCount;
5465 continue;
5466 }
5467
5468 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5469 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5470 }
5471
5472 // Create the collapsed loop control flow.
5473 CanonicalLoopInfo *Result =
5474 createLoopSkeleton(DL, CollapsedTripCount, F,
5475 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5476
5477 // Build the collapsed loop body code.
5478 // Start with deriving the input loop induction variables from the collapsed
5479 // one, using a divmod scheme. To preserve the original loops' order, the
5480 // innermost loop uses the least significant bits.
5481 Builder.restoreIP(Result->getBodyIP());
5482
5483 Value *Leftover = Result->getIndVar();
5484 SmallVector<Value *> NewIndVars;
5485 NewIndVars.resize(NumLoops);
5486 for (int i = NumLoops - 1; i >= 1; --i) {
5487 Value *OrigTripCount = Loops[i]->getTripCount();
5488
5489 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5490 NewIndVars[i] = NewIndVar;
5491
5492 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5493 }
5494 // Outermost loop gets all the remaining bits.
5495 NewIndVars[0] = Leftover;
5496
5497 // Construct the loop body control flow.
5498 // We progressively construct the branch structure following in direction of
5499 // the control flow, from the leading in-between code, the loop nest body, the
5500 // trailing in-between code, and rejoining the collapsed loop's latch.
5501 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
5502 // the ContinueBlock is set, continue with that block. If ContinuePred, use
5503 // its predecessors as sources.
5504 BasicBlock *ContinueBlock = Result->getBody();
5505 BasicBlock *ContinuePred = nullptr;
5506 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5507 BasicBlock *NextSrc) {
5508 if (ContinueBlock)
5509 redirectTo(ContinueBlock, Dest, DL);
5510 else
5511 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5512
5513 ContinueBlock = nullptr;
5514 ContinuePred = NextSrc;
5515 };
5516
5517 // The code before the nested loop of each level.
5518 // Because we are sinking it into the nest, it will be executed more often
5519 // than in the original loop. More sophisticated schemes could keep track of
5520 // what the in-between code is and instantiate it only once per thread.
5521 for (size_t i = 0; i < NumLoops - 1; ++i)
5522 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5523
5524 // Connect the loop nest body.
5525 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5526
5527 // The code after the nested loop at each level.
5528 for (size_t i = NumLoops - 1; i > 0; --i)
5529 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5530
5531 // Connect the finished loop to the collapsed loop latch.
5532 ContinueWith(Result->getLatch(), nullptr);
5533
5534 // Replace the input loops with the new collapsed loop.
5535 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5536 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5537
5538 // Replace the input loop indvars with the derived ones.
5539 for (size_t i = 0; i < NumLoops; ++i)
5540 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5541
5542 // Remove unused parts of the input loops.
5543 removeUnusedBlocksFromParent(OldControlBBs);
5544
5545 for (CanonicalLoopInfo *L : Loops)
5546 L->invalidate();
5547
5548#ifndef NDEBUG
5549 Result->assertOK();
5550#endif
5551 return Result;
5552}
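// Worked example of the divmod scheme above (sketch): collapsing two loops
// with trip counts N0 (outer) and N1 (inner) yields one loop of N0*N1
// iterations, and the original induction variables are recovered from the
// collapsed iv as
//
//   iv1 = iv % N1;   // innermost: least significant "digit"
//   iv0 = iv / N1;   // outermost: the remaining bits
//
// e.g. N0=3, N1=4, iv=10 gives iv0=2, iv1=2, i.e. the third outer and third
// inner iteration.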
5553
5554std::vector<CanonicalLoopInfo *>
5555OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5556 ArrayRef<Value *> TileSizes) {
5557 assert(TileSizes.size() == Loops.size() &&
5558 "Must pass as many tile sizes as there are loops");
5559 int NumLoops = Loops.size();
5560 assert(NumLoops >= 1 && "At least one loop to tile required");
5561
5562 CanonicalLoopInfo *OutermostLoop = Loops.front();
5563 CanonicalLoopInfo *InnermostLoop = Loops.back();
5564 Function *F = OutermostLoop->getBody()->getParent();
5565 BasicBlock *InnerEnter = InnermostLoop->getBody();
5566 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5567
5568 // Loop control blocks that may become orphaned later.
5569 SmallVector<BasicBlock *, 12> OldControlBBs;
5570 OldControlBBs.reserve(6 * Loops.size());
5571 for (CanonicalLoopInfo *Loop : Loops)
5572 Loop->collectControlBlocks(OldControlBBs);
5573
5574 // Collect original trip counts and induction variable to be accessible by
5575 // index. Also, the structure of the original loops is not preserved during
5576 // the construction of the tiled loops, so do it before we scavenge the BBs of
5577 // any original CanonicalLoopInfo.
5578 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5579 for (CanonicalLoopInfo *L : Loops) {
5580 assert(L->isValid() && "All input loops must be valid canonical loops");
5581 OrigTripCounts.push_back(L->getTripCount());
5582 OrigIndVars.push_back(L->getIndVar());
5583 }
5584
5585 // Collect the code between loop headers. These may contain SSA definitions
5586 // that are used in the loop nest body. To be usable within the innermost
5587 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5588 // these instructions may be executed more often than before the tiling.
5589 // TODO: It would be sufficient to only sink them into body of the
5590 // corresponding tile loop.
5591 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5592 for (int i = 0; i < NumLoops - 1; ++i) {
5593 CanonicalLoopInfo *Surrounding = Loops[i];
5594 CanonicalLoopInfo *Nested = Loops[i + 1];
5595
5596 BasicBlock *EnterBB = Surrounding->getBody();
5597 BasicBlock *ExitBB = Nested->getHeader();
5598 InbetweenCode.emplace_back(EnterBB, ExitBB);
5599 }
5600
5601 // Compute the trip counts of the floor loops.
5602 Builder.SetCurrentDebugLocation(DL);
5603 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5604 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5605 for (int i = 0; i < NumLoops; ++i) {
5606 Value *TileSize = TileSizes[i];
5607 Value *OrigTripCount = OrigTripCounts[i];
5608 Type *IVType = OrigTripCount->getType();
5609
5610 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5611 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5612
5613 // 0 if the tripcount is a multiple of the tilesize, 1 otherwise.
5614 // 1 means we need an additional iteration for a partial tile.
5615 //
5616 // Unfortunately we cannot just use the roundup-formula
5617 // (tripcount + tilesize - 1)/tilesize
5618 // because the summation might overflow. We do not want to introduce undefined
5619 // behavior when the untiled loop nest did not.
5620 Value *FloorTripOverflow =
5621 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5622
5623 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5624 Value *FloorTripCount =
5625 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5626 "omp_floor" + Twine(i) + ".tripcount", true);
5627
5628 // Remember some values for later use.
5629 FloorCompleteCount.push_back(FloorCompleteTripCount);
5630 FloorCount.push_back(FloorTripCount);
5631 FloorRems.push_back(FloorTripRem);
5632 }
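// For example (illustrative values): with OrigTripCount = 10 and TileSize = 3
// this computes FloorCompleteTripCount = 3, FloorTripRem = 1, and hence
// FloorTripCount = 4, matching ceil(10/3) without the possible overflow of
// the (tripcount + tilesize - 1) / tilesize formula.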
5633
5634 // Generate the new loop nest, from the outermost to the innermost.
5635 std::vector<CanonicalLoopInfo *> Result;
5636 Result.reserve(NumLoops * 2);
5637
5638 // The basic block of the surrounding loop that enters the next generated
5639 // loop.
5640 BasicBlock *Enter = OutermostLoop->getPreheader();
5641
5642 // The basic block of the surrounding loop where the inner code should
5643 // continue.
5644 BasicBlock *Continue = OutermostLoop->getAfter();
5645
5646 // Where the next loop basic block should be inserted.
5647 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5648
5649 auto EmbeddNewLoop =
5650 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5651 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5652 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5653 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5654 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5655 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5656
5657 // Set up the position where the next embedded loop connects to this loop.
5658 Enter = EmbeddedLoop->getBody();
5659 Continue = EmbeddedLoop->getLatch();
5660 OutroInsertBefore = EmbeddedLoop->getLatch();
5661 return EmbeddedLoop;
5662 };
5663
5664 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5665 const Twine &NameBase) {
5666 for (auto P : enumerate(TripCounts)) {
5667 CanonicalLoopInfo *EmbeddedLoop =
5668 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5669 Result.push_back(EmbeddedLoop);
5670 }
5671 };
5672
5673 EmbeddNewLoops(FloorCount, "floor");
5674
5675 // Within the innermost floor loop, emit the code that computes the tile
5676 // sizes.
5677 Builder.SetInsertPoint(Enter->getTerminator());
5678 SmallVector<Value *, 4> TileCounts;
5679 for (int i = 0; i < NumLoops; ++i) {
5680 CanonicalLoopInfo *FloorLoop = Result[i];
5681 Value *TileSize = TileSizes[i];
5682
5683 Value *FloorIsEpilogue =
5684 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5685 Value *TileTripCount =
5686 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5687
5688 TileCounts.push_back(TileTripCount);
5689 }
5690
5691 // Create the tile loops.
5692 EmbeddNewLoops(TileCounts, "tile");
5693
5694 // Insert the inbetween code into the body.
5695 BasicBlock *BodyEnter = Enter;
5696 BasicBlock *BodyEntered = nullptr;
5697 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5698 BasicBlock *EnterBB = P.first;
5699 BasicBlock *ExitBB = P.second;
5700
5701 if (BodyEnter)
5702 redirectTo(BodyEnter, EnterBB, DL);
5703 else
5704 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5705
5706 BodyEnter = nullptr;
5707 BodyEntered = ExitBB;
5708 }
5709
5710 // Append the original loop nest body into the generated loop nest body.
5711 if (BodyEnter)
5712 redirectTo(BodyEnter, InnerEnter, DL);
5713 else
5714 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5715 redirectAllPredecessorsTo(InnerLatch, Result.back()->getLatch(), DL);
5716
5717 // Replace the original induction variable with an induction variable computed
5718 // from the tile and floor induction variables.
5719 Builder.restoreIP(Result.back()->getBodyIP());
5720 for (int i = 0; i < NumLoops; ++i) {
5721 CanonicalLoopInfo *FloorLoop = Result[i];
5722 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5723 Value *OrigIndVar = OrigIndVars[i];
5724 Value *Size = TileSizes[i];
5725
5726 Value *Scale =
5727 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5728 Value *Shift =
5729 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5730 OrigIndVar->replaceAllUsesWith(Shift);
5731 }
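// Conceptually, a single loop of 10 iterations tiled by 3 now behaves like
// this sketch (not the exact emitted IR):
//   for (floor = 0; floor < 4; ++floor) {      // omp_floor0
//     count = (floor == 3) ? 1 : 3;            // epilogue handles partial tile
//     for (tile = 0; tile < count; ++tile)     // omp_tile0
//       body(3 * floor + tile);                // Shift = Scale + tile indvar
//   }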
5732
5733 // Remove unused parts of the original loops.
5734 removeUnusedBlocksFromParent(OldControlBBs);
5735
5736 for (CanonicalLoopInfo *L : Loops)
5737 L->invalidate();
5738
5739#ifndef NDEBUG
5740 for (CanonicalLoopInfo *GenL : Result)
5741 GenL->assertOK();
5742#endif
5743 return Result;
5744}
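// Usage sketch (hypothetical caller): tiling a 2-deep nest returns the four
// generated loops with the floor loops first:
//   std::vector<CanonicalLoopInfo *> Nest =
//       OMPBuilder.tileLoops(DL, {Outer, Inner}, {TileA, TileB});
//   // Nest == {floor0, floor1, tile0, tile1}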
5745
5746/// Attach metadata \p Properties to the basic block described by \p BB. If the
5747/// basic block already has metadata, the basic block properties are appended.
5748 static void addBasicBlockMetadata(BasicBlock *BB,
5749 ArrayRef<Metadata *> Properties) {
5750 // Nothing to do if no property to attach.
5751 if (Properties.empty())
5752 return;
5753
5754 LLVMContext &Ctx = BB->getContext();
5755 SmallVector<Metadata *> NewProperties;
5756 NewProperties.push_back(nullptr);
5757
5758 // If the basic block already has metadata, prepend it to the new metadata.
5759 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5760 if (Existing)
5761 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5762
5763 append_range(NewProperties, Properties);
5764 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5765 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5766
5767 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5768}
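// The distinct node references itself in its first operand, as loop metadata
// requires, e.g. (sketch):
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}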
5769
5770/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5771/// loop already has metadata, the loop properties are appended.
5772static void addLoopMetadata(CanonicalLoopInfo *Loop,
5773 ArrayRef<Metadata *> Properties) {
5774 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5775
5776 // Attach metadata to the loop's latch
5777 BasicBlock *Latch = Loop->getLatch();
5778 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5779 addBasicBlockMetadata(Latch, Properties);
5780}
5781
5782/// Attach llvm.access.group metadata to the memref instructions of \p Block
5783static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5784 LoopInfo &LI) {
5785 for (Instruction &I : *Block) {
5786 if (I.mayReadOrWriteMemory()) {
5787 // TODO: This instruction may already have access group from
5788 // other pragmas e.g. #pragma clang loop vectorize. Append
5789 // so that the existing metadata is not overwritten.
5790 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5791 }
5792 }
5793}
5794
5795void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5796 LLVMContext &Ctx = Builder.getContext();
5797 addLoopMetadata(
5798 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5799 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5800}
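// After unrollLoopFull, the latch terminator carries metadata of the form
// (sketch):
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.enable"}
//   !2 = !{!"llvm.loop.unroll.full"}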
5801
5802void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5803 LLVMContext &Ctx = Builder.getContext();
5804 addLoopMetadata(
5805 Loop, {
5806 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5807 });
5808}
5809
5810void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5811 Value *IfCond, ValueToValueMapTy &VMap,
5812 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5813 const Twine &NamePrefix) {
5814 Function *F = CanonicalLoop->getFunction();
5815
5816 // We can't do
5817 // if (cond) {
5818 // simd_loop;
5819 // } else {
5820 // non_simd_loop;
5821 // }
5822 // because then the CanonicalLoopInfo would only point to one of the loops,
5823 // causing other constructs operating on the same loop to malfunction.
5824 // Instead generate
5825 // while (...) {
5826 // if (cond) {
5827 // simd_body;
5828 // } else {
5829 // not_simd_body;
5830 // }
5831 // }
5832 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5833 // body at -O3
5834
5835 // Define where if branch should be inserted
5836 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5837
5838 // Create additional blocks for the if statement
5839 BasicBlock *Cond = SplitBeforeIt->getParent();
5840 llvm::LLVMContext &C = Cond->getContext();
5841 BasicBlock *ThenBlock = BasicBlock::Create(
5842 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5843 BasicBlock *ElseBlock = BasicBlock::Create(
5844 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5845
5846 // Create if condition branch.
5847 Builder.SetInsertPoint(SplitBeforeIt);
5848 Instruction *BrInstr =
5849 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5850 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5851 // Then block contains branch to omp loop body which needs to be vectorized
5852 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5853 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5854
5855 Builder.SetInsertPoint(ElseBlock);
5856
5857 // Clone loop for the else branch
5858 SmallVector<BasicBlock *, 8> NewBlocks;
5859
5860 SmallVector<BasicBlock *, 8> ExistingBlocks;
5861 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5862 ExistingBlocks.push_back(ThenBlock);
5863 ExistingBlocks.append(L->block_begin(), L->block_end());
5864 // Cond is the block that has the if clause condition
5865 // LoopCond is omp_loop.cond
5866 // LoopHeader is omp_loop.header
5867 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5868 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
5869 assert(LoopCond && LoopHeader && "Invalid loop structure");
5870 for (BasicBlock *Block : ExistingBlocks) {
5871 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5872 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5873 continue;
5874 }
5875 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5876
5877 // Fix the name so that it is not omp.if.then
5878 if (Block == ThenBlock)
5879 NewBB->setName(NamePrefix + ".if.else");
5880
5881 NewBB->moveBefore(CanonicalLoop->getExit());
5882 VMap[Block] = NewBB;
5883 NewBlocks.push_back(NewBB);
5884 }
5885 remapInstructionsInBlocks(NewBlocks, VMap);
5886 Builder.CreateBr(NewBlocks.front());
5887
5888 // The loop latch must have only one predecessor. Currently it is branched to
5889 // from both the 'then' and 'else' branches.
5890 L->getLoopLatch()->splitBasicBlock(
5891 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5892
5893 // Ensure that the then block is added to the loop so we add the attributes in
5894 // the next step
5895 L->addBasicBlockToLoop(ThenBlock, LI);
5896}
5897
5898unsigned
5899OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5900 const StringMap<bool> &Features) {
5901 if (TargetTriple.isX86()) {
5902 if (Features.lookup("avx512f"))
5903 return 512;
5904 else if (Features.lookup("avx"))
5905 return 256;
5906 return 128;
5907 }
5908 if (TargetTriple.isPPC())
5909 return 128;
5910 if (TargetTriple.isWasm())
5911 return 128;
5912 return 0;
5913}
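// Illustrative query (hypothetical values): an x86 target whose features
// include avx but not avx512f yields 256, i.e. a 32-byte default alignment:
//   StringMap<bool> Features;
//   Features["avx"] = true;
//   unsigned Bits = OMPBuilder.getOpenMPDefaultSimdAlign(TheTriple, Features);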
5914
5915void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5916 MapVector<Value *, Value *> AlignedVars,
5917 Value *IfCond, OrderKind Order,
5918 ConstantInt *Simdlen, ConstantInt *Safelen) {
5919 LLVMContext &Ctx = Builder.getContext();
5920
5921 Function *F = CanonicalLoop->getFunction();
5922
5923 // TODO: We should not rely on pass manager. Currently we use pass manager
5924 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5925 // object. We should have a method which returns all blocks between
5926 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5927 FunctionAnalysisManager FAM;
5928 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5929 FAM.registerPass([]() { return LoopAnalysis(); });
5930 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5931
5932 LoopAnalysis LIA;
5933 LoopInfo &&LI = LIA.run(*F, FAM);
5934
5935 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5936 if (AlignedVars.size()) {
5937 InsertPointTy IP = Builder.saveIP();
5938 for (auto &AlignedItem : AlignedVars) {
5939 Value *AlignedPtr = AlignedItem.first;
5940 Value *Alignment = AlignedItem.second;
5941 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5942 Builder.SetInsertPoint(loadInst->getNextNode());
5943 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5944 Alignment);
5945 }
5946 Builder.restoreIP(IP);
5947 }
5948
5949 if (IfCond) {
5950 ValueToValueMapTy VMap;
5951 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5952 }
5953
5954 SmallPtrSet<BasicBlock *, 8> Reachable;
5955
5956 // Get the basic blocks from the loop in which memref instructions
5957 // can be found.
5958 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5959 // preferably without running any passes.
5960 for (BasicBlock *Block : L->getBlocks()) {
5961 if (Block == CanonicalLoop->getCond() ||
5962 Block == CanonicalLoop->getHeader())
5963 continue;
5964 Reachable.insert(Block);
5965 }
5966
5967 SmallVector<Metadata *> LoopMDList;
5968
5969 // In presence of finite 'safelen', it may be unsafe to mark all
5970 // the memory instructions parallel, because loop-carried
5971 // dependences of 'safelen' iterations are possible.
5972 // If clause order(concurrent) is specified then the memory instructions
5973 // are marked parallel even if 'safelen' is finite.
5974 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5975 // Add access group metadata to memory-access instructions.
5976 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5977 for (BasicBlock *BB : Reachable)
5978 addSimdMetadata(BB, AccessGroup, LI);
5979 // TODO: If the loop has existing parallel access metadata, have
5980 // to combine two lists.
5981 LoopMDList.push_back(MDNode::get(
5982 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5983 }
5984
5985 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5986 // versions so we can't add the loop attributes in that case.
5987 if (IfCond) {
5988 // we can still add llvm.loop.parallel_access
5989 addLoopMetadata(CanonicalLoop, LoopMDList);
5990 return;
5991 }
5992
5993 // Use the above access group metadata to create loop level
5994 // metadata, which should be distinct for each loop.
5995 ConstantAsMetadata *BoolConst =
5996 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5997 LoopMDList.push_back(MDNode::get(
5998 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5999
6000 if (Simdlen || Safelen) {
6001 // If both simdlen and safelen clauses are specified, the value of the
6002 // simdlen parameter must be less than or equal to the value of the safelen
6003 // parameter. Therefore, use safelen only in the absence of simdlen.
6004 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6005 LoopMDList.push_back(
6006 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6007 ConstantAsMetadata::get(VectorizeWidth)}));
6008 }
6009
6010 addLoopMetadata(CanonicalLoop, LoopMDList);
6011}
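// For a simd loop with simdlen(8) and no if clause, the resulting loop
// metadata looks like (sketch):
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}  ; the access group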
6012
6013/// Create the TargetMachine object to query the backend for optimization
6014/// preferences.
6015///
6016/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6017/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6018 /// needed for the LLVM pass pipeline. We use some default options to avoid
6019/// having to pass too many settings from the frontend that probably do not
6020/// matter.
6021///
6022/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6023/// method. If we are going to use TargetMachine for more purposes, especially
6024/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6025 /// might become worth requiring front-ends to pass on their TargetMachine,
6026 /// or at least cache it between methods. Note that while front-ends such as Clang
6027/// have just a single main TargetMachine per translation unit, "target-cpu" and
6028/// "target-features" that determine the TargetMachine are per-function and can
6029 /// be overridden using __attribute__((target("OPTIONS"))).
6030 static std::unique_ptr<TargetMachine>
6031 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6032 Module *M = F->getParent();
6033
6034 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6035 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6036 const llvm::Triple &Triple = M->getTargetTriple();
6037
6038 std::string Error;
6039 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6040 if (!TheTarget)
6041 return {};
6042
6043 llvm::TargetOptions Options;
6044 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6045 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6046 /*CodeModel=*/std::nullopt, OptLevel));
6047}
6048
6049 /// Heuristically determine the best-performing unroll factor for \p CLI. This
6050/// depends on the target processor. We are re-using the same heuristics as the
6051/// LoopUnrollPass.
6052static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6053 Function *F = CLI->getFunction();
6054
6055 // Assume the user requests the most aggressive unrolling, even if the rest of
6056 // the code is optimized using a lower setting.
6057 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6058 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6059
6060 FunctionAnalysisManager FAM;
6061 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6062 FAM.registerPass([]() { return AssumptionAnalysis(); });
6063 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6064 FAM.registerPass([]() { return LoopAnalysis(); });
6065 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6066 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6067 TargetIRAnalysis TIRA;
6068 if (TM)
6069 TIRA = TargetIRAnalysis(
6070 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6071 FAM.registerPass([&]() { return TIRA; });
6072
6073 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6074 ScalarEvolutionAnalysis SEA;
6075 ScalarEvolution &&SE = SEA.run(*F, FAM);
6076 DominatorTreeAnalysis DTA;
6077 DominatorTree &&DT = DTA.run(*F, FAM);
6078 LoopAnalysis LIA;
6079 LoopInfo &&LI = LIA.run(*F, FAM);
6080 AssumptionAnalysis ACT;
6081 AssumptionCache &&AC = ACT.run(*F, FAM);
6082 OptimizationRemarkEmitter ORE{F};
6083
6084 Loop *L = LI.getLoopFor(CLI->getHeader());
6085 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6086
6087 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6088 L, SE, TTI,
6089 /*BlockFrequencyInfo=*/nullptr,
6090 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6091 /*UserThreshold=*/std::nullopt,
6092 /*UserCount=*/std::nullopt,
6093 /*UserAllowPartial=*/true,
6094 /*UserAllowRuntime=*/true,
6095 /*UserUpperBound=*/std::nullopt,
6096 /*UserFullUnrollMaxCount=*/std::nullopt);
6097
6098 UP.Force = true;
6099
6100 // Account for additional optimizations taking place before the LoopUnrollPass
6101 // would unroll the loop.
6102 UP.Threshold *= UnrollThresholdFactor;
6103 UP.PartialThreshold *= UnrollThresholdFactor;
6104
6105 // Use normal unroll factors even if the rest of the code is optimized for
6106 // size.
6107 UP.OptSizeThreshold = UP.Threshold;
6108 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6109
6110 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6111 << " Threshold=" << UP.Threshold << "\n"
6112 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6113 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6114 << " PartialOptSizeThreshold="
6115 << UP.PartialOptSizeThreshold << "\n");
6116
6117 // Disable peeling.
6118 TargetTransformInfo::PeelingPreferences PP =
6119 gatherPeelingPreferences(L, SE, TTI,
6120 /*UserAllowPeeling=*/false,
6121 /*UserAllowProfileBasedPeeling=*/false,
6122 /*UnrollingSpecficValues=*/false);
6123
6124 SmallPtrSet<const Value *, 32> EphValues;
6125 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6126
6127 // Assume that reads and writes to stack variables can be eliminated by
6128 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6129 // size.
6130 for (BasicBlock *BB : L->blocks()) {
6131 for (Instruction &I : *BB) {
6132 Value *Ptr;
6133 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6134 Ptr = Load->getPointerOperand();
6135 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6136 Ptr = Store->getPointerOperand();
6137 } else
6138 continue;
6139
6140 Ptr = Ptr->stripPointerCasts();
6141
6142 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6143 if (Alloca->getParent() == &F->getEntryBlock())
6144 EphValues.insert(&I);
6145 }
6146 }
6147 }
6148
6149 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6150
6151 // Loop is not unrollable if the loop contains certain instructions.
6152 if (!UCE.canUnroll()) {
6153 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6154 return 1;
6155 }
6156
6157 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6158 << "\n");
6159
6160 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6161 // be able to use it.
6162 int TripCount = 0;
6163 int MaxTripCount = 0;
6164 bool MaxOrZero = false;
6165 unsigned TripMultiple = 0;
6166
6167 bool UseUpperBound = false;
6168 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6169 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6170 UseUpperBound);
6171 unsigned Factor = UP.Count;
6172 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6173
6174 // This function returns 1 to signal that the loop should not be unrolled.
6175 if (Factor == 0)
6176 return 1;
6177 return Factor;
6178}
6179
6180void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6181 int32_t Factor,
6182 CanonicalLoopInfo **UnrolledCLI) {
6183 assert(Factor >= 0 && "Unroll factor must not be negative");
6184
6185 Function *F = Loop->getFunction();
6186 LLVMContext &Ctx = F->getContext();
6187
6188 // If the unrolled loop is not used for another loop-associated directive, it
6189 // is sufficient to add metadata for the LoopUnrollPass.
6190 if (!UnrolledCLI) {
6191 SmallVector<Metadata *, 2> LoopMetadata;
6192 LoopMetadata.push_back(
6193 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6194
6195 if (Factor >= 1) {
6196 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6197 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6198 LoopMetadata.push_back(MDNode::get(
6199 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6200 }
6201
6202 addLoopMetadata(Loop, LoopMetadata);
6203 return;
6204 }
6205
6206 // Heuristically determine the unroll factor.
6207 if (Factor == 0)
6208 Factor = computeHeuristicUnrollFactor(Loop);
6209
6210 // No change required with unroll factor 1.
6211 if (Factor == 1) {
6212 *UnrolledCLI = Loop;
6213 return;
6214 }
6215
6216 assert(Factor >= 2 &&
6217 "unrolling only makes sense with a factor of 2 or larger");
6218
6219 Type *IndVarTy = Loop->getIndVarType();
6220
6221 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6222 // unroll the inner loop.
6223 Value *FactorVal =
6224 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6225 /*isSigned=*/false));
6226 std::vector<CanonicalLoopInfo *> LoopNest =
6227 tileLoops(DL, {Loop}, {FactorVal});
6228 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6229 *UnrolledCLI = LoopNest[0];
6230 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6231
6232 // LoopUnrollPass can only fully unroll loops with constant trip count.
6233 // Unroll by the unroll factor with a fallback epilog for the remainder
6234 // iterations if necessary.
6235 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6236 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6237 addLoopMetadata(
6238 InnerLoop,
6239 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6240 MDNode::get(
6241 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6242
6243#ifndef NDEBUG
6244 (*UnrolledCLI)->assertOK();
6245#endif
6246}
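// E.g., unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled) tiles the loop
// by 4 and tags the inner tile loop with llvm.loop.unroll.count 4 for the
// LoopUnrollPass to flatten; *Unrolled is the outer floor loop, which remains
// usable by other loop-associated directives (sketch of intended behavior).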
6247
6248OpenMPIRBuilder::InsertPointTy
6249OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6250 llvm::Value *BufSize, llvm::Value *CpyBuf,
6251 llvm::Value *CpyFn, llvm::Value *DidIt) {
6252 if (!updateToLocation(Loc))
6253 return Loc.IP;
6254
6255 uint32_t SrcLocStrSize;
6256 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6257 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6258 Value *ThreadId = getOrCreateThreadID(Ident);
6259
6260 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6261
6262 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6263
6264 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6265 Builder.CreateCall(Fn, Args);
6266
6267 return Builder.saveIP();
6268}
6269
6270OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6271 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6272 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6273 ArrayRef<llvm::Function *> CPFuncs) {
6274
6275 if (!updateToLocation(Loc))
6276 return Loc.IP;
6277
6278 // If needed, allocate and initialize `DidIt` with 0.
6279 // DidIt: flag variable: 1=single thread; 0=not single thread.
6280 llvm::Value *DidIt = nullptr;
6281 if (!CPVars.empty()) {
6282 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6283 Builder.CreateStore(Builder.getInt32(0), DidIt);
6284 }
6285
6286 Directive OMPD = Directive::OMPD_single;
6287 uint32_t SrcLocStrSize;
6288 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6289 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6290 Value *ThreadId = getOrCreateThreadID(Ident);
6291 Value *Args[] = {Ident, ThreadId};
6292
6293 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6294 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6295
6296 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6297 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6298
6299 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6300 if (Error Err = FiniCB(IP))
6301 return Err;
6302
6303 // The thread that executes the single region must set `DidIt` to 1.
6304 // This is used by __kmpc_copyprivate, to know if the caller is the
6305 // single thread or not.
6306 if (DidIt)
6307 Builder.CreateStore(Builder.getInt32(1), DidIt);
6308
6309 return Error::success();
6310 };
6311
6312 // generates the following:
6313 // if (__kmpc_single()) {
6314 // .... single region ...
6315 // __kmpc_end_single
6316 // }
6317 // __kmpc_copyprivate
6318 // __kmpc_barrier
6319
6320 InsertPointOrErrorTy AfterIP =
6321 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6322 /*Conditional*/ true,
6323 /*hasFinalize*/ true);
6324 if (!AfterIP)
6325 return AfterIP.takeError();
6326
6327 if (DidIt) {
6328 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6329 // NOTE BufSize is currently unused, so just pass 0.
6330 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6331 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6332 CPFuncs[I], DidIt);
6333 // NOTE __kmpc_copyprivate already inserts a barrier
6334 } else if (!IsNowait) {
6335 InsertPointOrErrorTy AfterIP =
6336 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6337 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6338 /* CheckCancelFlag */ false);
6339 if (!AfterIP)
6340 return AfterIP.takeError();
6341 }
6342 return Builder.saveIP();
6343}
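// With copyprivate variables, the emitted sequence is roughly (sketch):
//   %did_it = alloca i32
//   store i32 0, ptr %did_it
//   if (__kmpc_single(%loc, %tid)) {
//     ... single region ...; store i32 1, ptr %did_it
//     __kmpc_end_single(%loc, %tid)
//   }
//   __kmpc_copyprivate(%loc, %tid, i64 0, ptr %cpvar, ptr @cpy_fn, %did_it)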
6344
6345OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6346 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6347 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6348
6349 if (!updateToLocation(Loc))
6350 return Loc.IP;
6351
6352 Directive OMPD = Directive::OMPD_critical;
6353 uint32_t SrcLocStrSize;
6354 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6355 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6356 Value *ThreadId = getOrCreateThreadID(Ident);
6357 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6358 Value *Args[] = {Ident, ThreadId, LockVar};
6359
6360 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6361 Function *RTFn = nullptr;
6362 if (HintInst) {
6363 // Add Hint to entry Args and create call
6364 EnterArgs.push_back(HintInst);
6365 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6366 } else {
6367 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6368 }
6369 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6370
6371 Function *ExitRTLFn =
6372 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6373 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6374
6375 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6376 /*Conditional*/ false, /*hasFinalize*/ true);
6377}
6378
6379OpenMPIRBuilder::InsertPointTy
6380OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6381 InsertPointTy AllocaIP, unsigned NumLoops,
6382 ArrayRef<llvm::Value *> StoreValues,
6383 const Twine &Name, bool IsDependSource) {
6384 assert(
6385 llvm::all_of(StoreValues,
6386 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6387 "OpenMP runtime requires depend vec with i64 type");
6388
6389 if (!updateToLocation(Loc))
6390 return Loc.IP;
6391
6392 // Allocate space for vector and generate alloc instruction.
6393 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6394 Builder.restoreIP(AllocaIP);
6395 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6396 ArgsBase->setAlignment(Align(8));
6397 updateToLocation(Loc);
6398
6399 // Store the index value with offset in depend vector.
6400 for (unsigned I = 0; I < NumLoops; ++I) {
6401 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6402 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6403 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6404 STInst->setAlignment(Align(8));
6405 }
6406
6407 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6408 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6409
6410 uint32_t SrcLocStrSize;
6411 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6412 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6413 Value *ThreadId = getOrCreateThreadID(Ident);
6414 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6415
6416 Function *RTLFn = nullptr;
6417 if (IsDependSource)
6418 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6419 else
6420 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6421 Builder.CreateCall(RTLFn, Args);
6422
6423 return Builder.saveIP();
6424}
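// For 'ordered depend(source)' with two associated loops, the generated code
// is roughly (sketch):
//   %vec = alloca [2 x i64], align 8
//   ; store both iteration values into %vec
//   call void @__kmpc_doacross_post(ptr %loc, i32 %tid, ptr %vec)
// A depend(sink: ...) clause calls @__kmpc_doacross_wait instead.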
6425
6426OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6427 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6428 FinalizeCallbackTy FiniCB, bool IsThreads) {
6429 if (!updateToLocation(Loc))
6430 return Loc.IP;
6431
6432 Directive OMPD = Directive::OMPD_ordered;
6433 Instruction *EntryCall = nullptr;
6434 Instruction *ExitCall = nullptr;
6435
6436 if (IsThreads) {
6437 uint32_t SrcLocStrSize;
6438 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6439 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6440 Value *ThreadId = getOrCreateThreadID(Ident);
6441 Value *Args[] = {Ident, ThreadId};
6442
6443 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6444 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6445
6446 Function *ExitRTLFn =
6447 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6448 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6449 }
6450
6451 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6452 /*Conditional*/ false, /*hasFinalize*/ true);
6453}
6454
6455OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6456 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6457 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6458 bool HasFinalize, bool IsCancellable) {
6459
6460 if (HasFinalize)
6461 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6462
6463 // Create inlined region's entry and body blocks, in preparation
6464 // for conditional creation
6465 BasicBlock *EntryBB = Builder.GetInsertBlock();
6466 Instruction *SplitPos = EntryBB->getTerminator();
6467 if (!isa_and_nonnull<BranchInst>(SplitPos))
6468 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6469 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6470 BasicBlock *FiniBB =
6471 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6472
6473 Builder.SetInsertPoint(EntryBB->getTerminator());
6474 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6475
6476 // generate body
6477 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6478 /* CodeGenIP */ Builder.saveIP()))
6479 return Err;
6480
6481 // emit exit call and do any needed finalization.
6482 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6483 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6484 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6485 "Unexpected control flow graph state!!");
6486 InsertPointOrErrorTy AfterIP =
6487 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6488 if (!AfterIP)
6489 return AfterIP.takeError();
6490 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6491 "Unexpected Control Flow State!");
6492 MergeBlockIntoPredecessor(FiniBB);
6493
6494 // If we are skipping the region of a non-conditional, remove the exit
6495 // block, and clear the builder's insertion point.
6496 assert(SplitPos->getParent() == ExitBB &&
6497 "Unexpected Insertion point location!");
6498 auto merged = MergeBlockIntoPredecessor(ExitBB);
6499 BasicBlock *ExitPredBB = SplitPos->getParent();
6500 auto InsertBB = merged ? ExitPredBB : ExitBB;
6501 if (!isa_and_nonnull<BranchInst>(SplitPos))
6502 SplitPos->eraseFromParent();
6503 Builder.SetInsertPoint(InsertBB);
6504
6505 return Builder.saveIP();
6506}
6507
6508OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6509 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6510 // If there is nothing to do, return the current insertion point.
6511 if (!Conditional || !EntryCall)
6512 return Builder.saveIP();
6513
6514 BasicBlock *EntryBB = Builder.GetInsertBlock();
6515 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6516 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6517 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6518
6519 // Emit thenBB and set the Builder's insertion point there for
6520 // body generation next. Place the block after the current block.
6521 Function *CurFn = EntryBB->getParent();
6522 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6523
6524 // Move Entry branch to end of ThenBB, and replace with conditional
6525 // branch (If-stmt)
6526 Instruction *EntryBBTI = EntryBB->getTerminator();
6527 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6528 EntryBBTI->removeFromParent();
6529 Builder.SetInsertPoint(UI);
6530 Builder.Insert(EntryBBTI);
6531 UI->eraseFromParent();
6532 Builder.SetInsertPoint(ThenBB->getTerminator());
6533
6534 // return an insertion point to ExitBB.
6535 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6536}
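// For a conditional entry such as __kmpc_single, this produces (sketch):
//   %r = call i32 @__kmpc_single(ptr %loc, i32 %tid)
//   %c = icmp ne i32 %r, 0
//   br i1 %c, label %omp_region.body, label %omp_region.end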
6537
6538OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6539 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6540 bool HasFinalize) {
6541
6542 Builder.restoreIP(FinIP);
6543
6544 // If there is finalization to do, emit it before the exit call
6545 if (HasFinalize) {
6546 assert(!FinalizationStack.empty() &&
6547 "Unexpected finalization stack state!");
6548
6549 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6550 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6551
6552 if (Error Err = Fi.FiniCB(FinIP))
6553 return Err;
6554
6555 BasicBlock *FiniBB = FinIP.getBlock();
6556 Instruction *FiniBBTI = FiniBB->getTerminator();
6557
6558 // set Builder IP for call creation
6559 Builder.SetInsertPoint(FiniBBTI);
6560 }
6561
6562 if (!ExitCall)
6563 return Builder.saveIP();
6564
6565 // Place the exit call as the last instruction before the finalization block terminator.
6566 ExitCall->removeFromParent();
6567 Builder.Insert(ExitCall);
6568
6569 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6570 ExitCall->getIterator());
6571}
6572
6573OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6574 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6575 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6576 if (!IP.isSet())
6577 return IP;
6578
6579 IRBuilder<>::InsertPointGuard IPG(Builder);
6580
6581 // creates the following CFG structure
6582 // OMP_Entry : (MasterAddr != PrivateAddr)?
6583 // F T
6584 // | \
6585 // | copyin.not.master
6586 // | /
6587 // v /
6588 // copyin.not.master.end
6589 // |
6590 // v
6591 // OMP.Entry.Next
6592
6593 BasicBlock *OMP_Entry = IP.getBlock();
6594 Function *CurFn = OMP_Entry->getParent();
6595 BasicBlock *CopyBegin =
6596 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6597 BasicBlock *CopyEnd = nullptr;
6598
6599 // If entry block is terminated, split to preserve the branch to following
6600 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6601 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6602 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6603 "copyin.not.master.end");
6604 OMP_Entry->getTerminator()->eraseFromParent();
6605 } else {
6606 CopyEnd =
6607 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6608 }
6609
6610 Builder.SetInsertPoint(OMP_Entry);
6611 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6612 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6613 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6614 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6615
6616 Builder.SetInsertPoint(CopyBegin);
6617 if (BranchtoEnd)
6618 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6619
6620 return Builder.saveIP();
6621}
6622
6623CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6624 Value *Size, Value *Allocator,
6625 std::string Name) {
6626 IRBuilder<>::InsertPointGuard IPG(Builder);
6627 updateToLocation(Loc);
6628
6629 uint32_t SrcLocStrSize;
6630 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6631 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6632 Value *ThreadId = getOrCreateThreadID(Ident);
6633 Value *Args[] = {ThreadId, Size, Allocator};
6634
6635 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6636
6637 return Builder.CreateCall(Fn, Args, Name);
6638}
6639
6640CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6641 Value *Addr, Value *Allocator,
6642 std::string Name) {
6643 IRBuilder<>::InsertPointGuard IPG(Builder);
6644 updateToLocation(Loc);
6645
6646 uint32_t SrcLocStrSize;
6647 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6648 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6649 Value *ThreadId = getOrCreateThreadID(Ident);
6650 Value *Args[] = {ThreadId, Addr, Allocator};
6651 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6652 return Builder.CreateCall(Fn, Args, Name);
6653}
6654
6655CallInst *OpenMPIRBuilder::createOMPInteropInit(
6656 const LocationDescription &Loc, Value *InteropVar,
6657 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6658 Value *DependenceAddress, bool HaveNowaitClause) {
6659 IRBuilder<>::InsertPointGuard IPG(Builder);
6660 updateToLocation(Loc);
6661
6662 uint32_t SrcLocStrSize;
6663 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6664 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6665 Value *ThreadId = getOrCreateThreadID(Ident);
6666 if (Device == nullptr)
6667 Device = Constant::getAllOnesValue(Int32);
6668 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6669 if (NumDependences == nullptr) {
6670 NumDependences = ConstantInt::get(Int32, 0);
6671 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6672 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6673 }
6674 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6675 Value *Args[] = {
6676 Ident, ThreadId, InteropVar, InteropTypeVal,
6677 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6678
6679 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6680
6681 return Builder.CreateCall(Fn, Args);
6682}
6683
6684CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6685 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6686 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6687 IRBuilder<>::InsertPointGuard IPG(Builder);
6688 updateToLocation(Loc);
6689
6690 uint32_t SrcLocStrSize;
6691 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6692 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6693 Value *ThreadId = getOrCreateThreadID(Ident);
6694 if (Device == nullptr)
6695 Device = Constant::getAllOnesValue(Int32);
6696 if (NumDependences == nullptr) {
6697 NumDependences = ConstantInt::get(Int32, 0);
6698 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6699 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6700 }
6701 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6702 Value *Args[] = {
6703 Ident, ThreadId, InteropVar, Device,
6704 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6705
6706 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6707
6708 return Builder.CreateCall(Fn, Args);
6709}
6710
6711CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6712 Value *InteropVar, Value *Device,
6713 Value *NumDependences,
6714 Value *DependenceAddress,
6715 bool HaveNowaitClause) {
6716 IRBuilder<>::InsertPointGuard IPG(Builder);
6717 updateToLocation(Loc);
6718 uint32_t SrcLocStrSize;
6719 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6720 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6721 Value *ThreadId = getOrCreateThreadID(Ident);
6722 if (Device == nullptr)
6723 Device = Constant::getAllOnesValue(Int32);
6724 if (NumDependences == nullptr) {
6725 NumDependences = ConstantInt::get(Int32, 0);
6726 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6727 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6728 }
6729 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6730 Value *Args[] = {
6731 Ident, ThreadId, InteropVar, Device,
6732 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6733
6734 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6735
6736 return Builder.CreateCall(Fn, Args);
6737}
6738
6739CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6740 const LocationDescription &Loc, llvm::Value *Pointer,
6741 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6742 IRBuilder<>::InsertPointGuard IPG(Builder);
6743 updateToLocation(Loc);
6744
6745 uint32_t SrcLocStrSize;
6746 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6747 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6748 Value *ThreadId = getOrCreateThreadID(Ident);
6749 Constant *ThreadPrivateCache =
6750 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6751 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6752
6753 Function *Fn =
6754 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6755
6756 return Builder.CreateCall(Fn, Args);
6757}
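// The emitted call has the form (sketch; the cache is an internal global
// named after \p Name):
//   %priv = call ptr @__kmpc_threadprivate_cached(ptr %loc, i32 %tid,
//                                                 ptr %var, i64 %size,
//                                                 ptr @<name>.cache)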
6758
6759OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6760 const LocationDescription &Loc,
6761 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6762 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6763 "expected num_threads and num_teams to be specified");
6764
6765 if (!updateToLocation(Loc))
6766 return Loc.IP;
6767
6768 uint32_t SrcLocStrSize;
6769 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6770 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6771 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6772 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6773 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6774 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6775 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6776
6777 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6778 Function *Kernel = DebugKernelWrapper;
6779
6780 // We need to strip the debug prefix to get the correct kernel name.
6781 StringRef KernelName = Kernel->getName();
6782 const std::string DebugPrefix = "_debug__";
6783 if (KernelName.ends_with(DebugPrefix)) {
6784 KernelName = KernelName.drop_back(DebugPrefix.length());
6785 Kernel = M.getFunction(KernelName);
6786 assert(Kernel && "Expected the real kernel to exist");
6787 }
6788
6789 // Manifest the launch configuration in the metadata matching the kernel
6790 // environment.
6791 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6792 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6793
6794 // If MaxThreads is not set, select the maximum between the default workgroup
6795 // size and the MinThreads value.
6796 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6797 if (MaxThreadsVal < 0)
6798 MaxThreadsVal = std::max(
6799 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6800
6801 if (MaxThreadsVal > 0)
6802 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6803
6804 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6805 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6806 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6807 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6808 Constant *ReductionDataSize =
6809 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6810 Constant *ReductionBufferLength =
6811 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6812
6813 Function *Fn = getOrCreateRuntimeFunctionPtr(
6814 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6815 const DataLayout &DL = Fn->getDataLayout();
6816
6817 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6818 Constant *DynamicEnvironmentInitializer =
6819 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6820 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6821 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6822 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6823 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6824 DL.getDefaultGlobalsAddressSpace());
6825 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6826
6827 Constant *DynamicEnvironment =
6828 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6829 ? DynamicEnvironmentGV
6830 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6831 DynamicEnvironmentPtr);
6832
6833 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6834 ConfigurationEnvironment, {
6835 UseGenericStateMachineVal,
6836 MayUseNestedParallelismVal,
6837 IsSPMDVal,
6838 MinThreads,
6839 MaxThreads,
6840 MinTeams,
6841 MaxTeams,
6842 ReductionDataSize,
6843 ReductionBufferLength,
6844 });
6845 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6846 KernelEnvironment, {
6847 ConfigurationEnvironmentInitializer,
6848 Ident,
6849 DynamicEnvironment,
6850 });
6851 std::string KernelEnvironmentName =
6852 (KernelName + "_kernel_environment").str();
6853 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6854 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6855 KernelEnvironmentInitializer, KernelEnvironmentName,
6856 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6857 DL.getDefaultGlobalsAddressSpace());
6858 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6859
6860 Constant *KernelEnvironment =
6861 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6862 ? KernelEnvironmentGV
6863 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6864 KernelEnvironmentPtr);
6865 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6866 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6867 KernelLaunchEnvironment =
6868 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6869 ? KernelLaunchEnvironment
6870 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6871 KernelLaunchEnvParamTy);
6872 CallInst *ThreadKind =
6873 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6874
6875 Value *ExecUserCode = Builder.CreateICmpEQ(
6876 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6877 "exec_user_code");
6878
6879 // ThreadKind = __kmpc_target_init(...)
6880 // if (ThreadKind == -1)
6881 // user_code
6882 // else
6883 // return;
6884
6885 auto *UI = Builder.CreateUnreachable();
6886 BasicBlock *CheckBB = UI->getParent();
6887 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6888
6889 BasicBlock *WorkerExitBB = BasicBlock::Create(
6890 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6891 Builder.SetInsertPoint(WorkerExitBB);
6892 Builder.CreateRetVoid();
6893
6894 auto *CheckBBTI = CheckBB->getTerminator();
6895 Builder.SetInsertPoint(CheckBBTI);
6896 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6897
6898 CheckBBTI->eraseFromParent();
6899 UI->eraseFromParent();
6900
6901 // Continue in the "user_code" block, see diagram above and in
6902 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6903 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6904}
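// The generated kernel prologue thus looks like (sketch):
//   %tk = call i32 @__kmpc_target_init(ptr @<kernel>_kernel_environment,
//                                      ptr %launch_env)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit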
6905
6906void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6907 int32_t TeamsReductionDataSize,
6908 int32_t TeamsReductionBufferLength) {
6909 if (!updateToLocation(Loc))
6910 return;
6911
6912 Function *Fn = getOrCreateRuntimeFunctionPtr(
6913 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6914
6915 Builder.CreateCall(Fn, {});
6916
6917 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6918 return;
6919
6920 Function *Kernel = Builder.GetInsertBlock()->getParent();
6921 // We need to strip the debug prefix to get the correct kernel name.
6922 StringRef KernelName = Kernel->getName();
6923 const std::string DebugPrefix = "_debug__";
6924 if (KernelName.ends_with(DebugPrefix))
6925 KernelName = KernelName.drop_back(DebugPrefix.length());
6926 auto *KernelEnvironmentGV =
6927 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6928 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6929 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6930 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6931 KernelEnvironmentInitializer,
6932 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6933 NewInitializer = ConstantFoldInsertValueInstruction(
6934 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6935 {0, 8});
6936 KernelEnvironmentGV->setInitializer(NewInitializer);
6937}
6938
6939static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6940 bool Min) {
6941 if (Kernel.hasFnAttribute(Name)) {
6942 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6943 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6944 }
6945 Kernel.addFnAttr(Name, llvm::utostr(Value));
6946}
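// E.g., if the kernel already carries "nvvm.maxntid"="256", then calling
// updateNVPTXAttr(Kernel, "nvvm.maxntid", 128, /*Min=*/true) tightens it to
// "128"; with Min=false the larger value would be kept (illustrative).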
6947
6948std::pair<int32_t, int32_t>
6949OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6950 int32_t ThreadLimit =
6951 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6952
6953 if (T.isAMDGPU()) {
6954 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6955 if (!Attr.isValid() || !Attr.isStringAttribute())
6956 return {0, ThreadLimit};
6957 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6958 int32_t LB, UB;
6959 if (!llvm::to_integer(UBStr, UB, 10))
6960 return {0, ThreadLimit};
6961 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6962 if (!llvm::to_integer(LBStr, LB, 10))
6963 return {0, UB};
6964 return {LB, UB};
6965 }
6966
6967 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6968 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6969 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6970 }
6971 return {0, ThreadLimit};
6972}
6973
6974void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6975 Function &Kernel, int32_t LB,
6976 int32_t UB) {
6977 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6978
6979 if (T.isAMDGPU()) {
6980 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6981 llvm::utostr(LB) + "," + llvm::utostr(UB));
6982 return;
6983 }
6984
6985 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6986}
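// E.g., writeThreadBoundsForKernel(T, K, /*LB=*/1, /*UB=*/256) adds
// "omp_target_thread_limit"="256" and, on AMDGPU,
// "amdgpu-flat-work-group-size"="1,256" (illustrative values).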
6987
6988std::pair<int32_t, int32_t>
6989OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6990 // TODO: Read from backend annotations if available.
6991 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6992}
6993
6994void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6995 int32_t LB, int32_t UB) {
6996 if (T.isNVPTX())
6997 if (UB > 0)
6998 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
6999 if (T.isAMDGPU())
7000 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7001
7002 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7003}
7004
7005void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7006 Function *OutlinedFn) {
7007 if (Config.isTargetDevice()) {
7008 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7009 // TODO: Determine if DSO local can be set to true.
7010 OutlinedFn->setDSOLocal(false);
7011 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7012 if (T.isAMDGCN())
7013 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7014 else if (T.isNVPTX())
7015 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7016 else if (T.isSPIRV())
7017 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7018 }
7019}
7020
7021Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7022 StringRef EntryFnIDName) {
7023 if (Config.isTargetDevice()) {
7024 assert(OutlinedFn && "The outlined function must exist if embedded");
7025 return OutlinedFn;
7026 }
7027
7028 return new GlobalVariable(
7029 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7030 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7031}
7032
7033Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7034 StringRef EntryFnName) {
7035 if (OutlinedFn)
7036 return OutlinedFn;
7037
7038 assert(!M.getGlobalVariable(EntryFnName, true) &&
7039 "Named kernel already exists?");
7040 return new GlobalVariable(
7041 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7042 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7043}
7044
7045Error OpenMPIRBuilder::emitTargetRegionFunction(
7046 TargetRegionEntryInfo &EntryInfo,
7047 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7048 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7049
7050 SmallString<64> EntryFnName;
7051 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7052
7053 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7054 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7055 if (!CBResult)
7056 return CBResult.takeError();
7057 OutlinedFn = *CBResult;
7058 } else {
7059 OutlinedFn = nullptr;
7060 }
7061
7062 // If this target outline function is not an offload entry, we don't need to
7063 // register it. This may be the case for a false if clause, or if there are
7064 // no OpenMP targets.
7065 if (!IsOffloadEntry)
7066 return Error::success();
7067
7068 std::string EntryFnIDName =
7069 Config.isTargetDevice()
7070 ? std::string(EntryFnName)
7071 : createPlatformSpecificName({EntryFnName, "region_id"});
7072
7073 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7074 EntryFnName, EntryFnIDName);
7075 return Error::success();
7076}
7077
7078Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7079 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7080 StringRef EntryFnName, StringRef EntryFnIDName) {
7081 if (OutlinedFn)
7082 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7083 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7084 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7085 OffloadInfoManager.registerTargetRegionEntryInfo(
7086 EntryInfo, EntryAddr, OutlinedFnID,
7087 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7088 return OutlinedFnID;
7089}
7090
7091OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7092 const LocationDescription &Loc, InsertPointTy AllocaIP,
7093 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7094 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7095 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7096 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7097 BodyGenTy BodyGenType)>
7098 BodyGenCB,
7099 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7100 if (!updateToLocation(Loc))
7101 return InsertPointTy();
7102
7103 Builder.restoreIP(CodeGenIP);
7104 // Disable TargetData CodeGen on the device pass.
7105 if (Config.IsTargetDevice.value_or(false)) {
7106 if (BodyGenCB) {
7107 InsertPointOrErrorTy AfterIP =
7108 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7109 if (!AfterIP)
7110 return AfterIP.takeError();
7111 Builder.restoreIP(*AfterIP);
7112 }
7113 return Builder.saveIP();
7114 }
7115
7116 bool IsStandAlone = !BodyGenCB;
7117 MapInfosTy *MapInfo;
7118 // Generate the code for the opening of the data environment. Capture all the
7119 // arguments of the runtime call by reference because they are used in the
7120 // closing of the region.
7121 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7122 InsertPointTy CodeGenIP) -> Error {
7123 MapInfo = &GenMapInfoCB(Builder.saveIP());
7124 if (Error Err = emitOffloadingArrays(
7125 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7126 /*IsNonContiguous=*/true, DeviceAddrCB))
7127 return Err;
7128
7129 TargetDataRTArgs RTArgs;
7130 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7131
7132 // Emit the number of elements in the offloading arrays.
7133 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7134
7135 // Source location for the ident struct
7136 if (!SrcLocInfo) {
7137 uint32_t SrcLocStrSize;
7138 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7139 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7140 }
7141
7142 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7143 SrcLocInfo, DeviceID,
7144 PointerNum, RTArgs.BasePointersArray,
7145 RTArgs.PointersArray, RTArgs.SizesArray,
7146 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7147 RTArgs.MappersArray};
7148
7149 if (IsStandAlone) {
7150 assert(MapperFunc && "MapperFunc missing for standalone target data");
7151
7152 auto TaskBodyCB = [&](Value *, Value *,
7153 IRBuilderBase::InsertPoint) -> Error {
7154 if (Info.HasNoWait) {
7155 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7156 llvm::Constant::getNullValue(VoidPtr),
7157 llvm::Constant::getNullValue(Int32),
7158 llvm::Constant::getNullValue(VoidPtr)});
7159 }
7160
7161 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7162 OffloadingArgs);
7163
7164 if (Info.HasNoWait) {
7165 BasicBlock *OffloadContBlock =
7166 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7167 Function *CurFn = Builder.GetInsertBlock()->getParent();
7168 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7169 restoreIPandDebugLoc(Builder, Builder.saveIP());
7170 }
7171 return Error::success();
7172 };
7173
7174 bool RequiresOuterTargetTask = Info.HasNoWait;
7175 if (!RequiresOuterTargetTask)
7176 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7177 /*TargetTaskAllocaIP=*/{}));
7178 else
7179 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7180 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7181 } else {
7182 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7183 omp::OMPRTL___tgt_target_data_begin_mapper);
7184
7185 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7186
7187 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7188 if (isa<AllocaInst>(DeviceMap.second.second)) {
7189 auto *LI =
7190 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7191 Builder.CreateStore(LI, DeviceMap.second.second);
7192 }
7193 }
7194
7195 // If device pointer privatization is required, emit the body of the
7196 // region here. It will have to be duplicated: with and without
7197 // privatization.
7198 InsertPointOrErrorTy AfterIP =
7199 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7200 if (!AfterIP)
7201 return AfterIP.takeError();
7202 Builder.restoreIP(*AfterIP);
7203 }
7204 return Error::success();
7205 };
7206
7207 // If we need device pointer privatization, we need to emit the body of the
7208 // region with no privatization in the 'else' branch of the conditional.
7209 // Otherwise, we don't have to do anything.
7210 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7211 InsertPointTy CodeGenIP) -> Error {
7212 InsertPointOrErrorTy AfterIP =
7213 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7214 if (!AfterIP)
7215 return AfterIP.takeError();
7216 Builder.restoreIP(*AfterIP);
7217 return Error::success();
7218 };
7219
7220 // Generate code for the closing of the data region.
7221 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7222 TargetDataRTArgs RTArgs;
7223 Info.EmitDebug = !MapInfo->Names.empty();
7224 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7225
7226 // Emit the number of elements in the offloading arrays.
7227 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7228
7229 // Source location for the ident struct
7230 if (!SrcLocInfo) {
7231 uint32_t SrcLocStrSize;
7232 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7233 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7234 }
7235
7236 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7237 PointerNum, RTArgs.BasePointersArray,
7238 RTArgs.PointersArray, RTArgs.SizesArray,
7239 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7240 RTArgs.MappersArray};
7241 Function *EndMapperFunc =
7242 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7243
7244 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7245 return Error::success();
7246 };
7247
7248 // We don't have to do anything to close the region if the if clause evaluates
7249 // to false.
7250 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7251 return Error::success();
7252 };
7253
7254 Error Err = [&]() -> Error {
7255 if (BodyGenCB) {
7256 Error Err = [&]() {
7257 if (IfCond)
7258 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7259 return BeginThenGen(AllocaIP, Builder.saveIP());
7260 }();
7261
7262 if (Err)
7263 return Err;
7264
7265 // If we don't require privatization of device pointers, we emit the body
7266 // in between the runtime calls. This avoids duplicating the body code.
7267 InsertPointOrErrorTy AfterIP =
7268 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7269 if (!AfterIP)
7270 return AfterIP.takeError();
7271 restoreIPandDebugLoc(Builder, *AfterIP);
7272
7273 if (IfCond)
7274 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7275 return EndThenGen(AllocaIP, Builder.saveIP());
7276 }
7277 if (IfCond)
7278 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7279 return BeginThenGen(AllocaIP, Builder.saveIP());
7280 }();
7281
7282 if (Err)
7283 return Err;
7284
7285 return Builder.saveIP();
7286}
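// As a rough sketch (assuming no if-clause and no device-pointer
// privatization), the host IR produced by createTargetData has the shape:
//
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %device_id,
//       i32 %num_ptrs, ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes,
//       ptr %mapnames, ptr %mappers)
//   ;; ... body emitted by BodyGenCB ...
//   call void @__tgt_target_data_end_mapper(...) ;; same argument shape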
7287
7288FunctionCallee
7289OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7290 bool IsGPUDistribute) {
7291 assert((IVSize == 32 || IVSize == 64) &&
7292 "IV size is not compatible with the omp runtime");
7293 RuntimeFunction Name;
7294 if (IsGPUDistribute)
7295 Name = IVSize == 32
7296 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7297 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7298 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7299 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7300 else
7301 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7302 : omp::OMPRTL___kmpc_for_static_init_4u)
7303 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7304 : omp::OMPRTL___kmpc_for_static_init_8u);
7305
7306 return getOrCreateRuntimeFunction(M, Name);
7307}
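// For example, createForStaticInitFunction(/*IVSize=*/32, /*IVSigned=*/true,
// /*IsGPUDistribute=*/false) resolves to __kmpc_for_static_init_4, whereas
// the unsigned 64-bit distribute variant resolves to
// __kmpc_distribute_static_init_8u.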
7308
7309FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7310 bool IVSigned) {
7311 assert((IVSize == 32 || IVSize == 64) &&
7312 "IV size is not compatible with the omp runtime");
7313 RuntimeFunction Name = IVSize == 32
7314 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7315 : omp::OMPRTL___kmpc_dispatch_init_4u)
7316 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7317 : omp::OMPRTL___kmpc_dispatch_init_8u);
7318
7319 return getOrCreateRuntimeFunction(M, Name);
7320}
7321
7322FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7323 bool IVSigned) {
7324 assert((IVSize == 32 || IVSize == 64) &&
7325 "IV size is not compatible with the omp runtime");
7326 RuntimeFunction Name = IVSize == 32
7327 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7328 : omp::OMPRTL___kmpc_dispatch_next_4u)
7329 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7330 : omp::OMPRTL___kmpc_dispatch_next_8u);
7331
7332 return getOrCreateRuntimeFunction(M, Name);
7333}
7334
7335FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7336 bool IVSigned) {
7337 assert((IVSize == 32 || IVSize == 64) &&
7338 "IV size is not compatible with the omp runtime");
7339 RuntimeFunction Name = IVSize == 32
7340 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7341 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7342 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7343 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7344
7345 return getOrCreateRuntimeFunction(M, Name);
7346}
7347
7348FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7349 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7350}
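// The dispatch helpers above follow the same naming convention as the static
// init functions: the suffix encodes the IV width in bytes and signedness,
// e.g. createDispatchNextFunction(64, /*IVSigned=*/false) yields
// __kmpc_dispatch_next_8u.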
7351
7352static void FixupDebugInfoForOutlinedFunction(
7353 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7354 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7355
7356 DISubprogram *NewSP = Func->getSubprogram();
7357 if (!NewSP)
7358 return;
7359
7360 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7361
7362 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7363 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7364 // Only use cached variable if the arg number matches. This is important
7365 // so that DIVariables created for privatized variables are not discarded.
7366 if (NewVar && (arg == NewVar->getArg()))
7367 return NewVar;
7368
7369 NewVar = llvm::DILocalVariable::get(
7370 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7371 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7372 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7373 return NewVar;
7374 };
7375
7376 auto UpdateDebugRecord = [&](auto *DR) {
7377 DILocalVariable *OldVar = DR->getVariable();
7378 unsigned ArgNo = 0;
7379 for (auto Loc : DR->location_ops()) {
7380 auto Iter = ValueReplacementMap.find(Loc);
7381 if (Iter != ValueReplacementMap.end()) {
7382 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7383 ArgNo = std::get<1>(Iter->second) + 1;
7384 }
7385 }
7386 if (ArgNo != 0)
7387 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7388 };
7389
7390 // The location and scope of variable intrinsics and records still point to
7391 // the parent function of the target region. Update them.
7392 for (Instruction &I : instructions(Func)) {
7393 assert(!isa<DbgInfoIntrinsic>(I) &&
7394 "Unexpected debug intrinsic");
7395 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7396 UpdateDebugRecord(&DVR);
7397 }
7398 // An extra argument is passed to the device. Create the debug data for it.
7399 if (OMPBuilder.Config.isTargetDevice()) {
7400 DICompileUnit *CU = NewSP->getUnit();
7401 Module *M = Func->getParent();
7402 DIBuilder DB(*M, true, CU);
7403 DIType *VoidPtrTy =
7404 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7405 DILocalVariable *Var = DB.createParameterVariable(
7406 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7407 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7408 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7409 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7410 &(*Func->begin()));
7411 }
7412}
7413
7414static Expected<Function *> createOutlinedFunction(
7415 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7416 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7417 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7418 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7419 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7420 SmallVector<Type *> ParameterTypes;
7421 if (OMPBuilder.Config.isTargetDevice()) {
7422 // Add the "implicit" runtime argument we use to provide launch specific
7423 // information for target devices.
7424 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7425 ParameterTypes.push_back(Int8PtrTy);
7426
7427 // All parameters to target devices are passed as pointers
7428 // or i64. This assumes 64-bit address spaces/pointers.
7429 for (auto &Arg : Inputs)
7430 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7431 ? Arg->getType()
7432 : Type::getInt64Ty(Builder.getContext()));
7433 } else {
7434 for (auto &Arg : Inputs)
7435 ParameterTypes.push_back(Arg->getType());
7436 }
7437
7438 auto BB = Builder.GetInsertBlock();
7439 auto M = BB->getModule();
7440 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7441 /*isVarArg*/ false);
7442 auto Func =
7443 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7444
7445 // Forward target-cpu and target-features function attributes from the
7446 // original function to the new outlined function.
7447 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7448
7449 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7450 if (TargetCpuAttr.isStringAttribute())
7451 Func->addFnAttr(TargetCpuAttr);
7452
7453 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7454 if (TargetFeaturesAttr.isStringAttribute())
7455 Func->addFnAttr(TargetFeaturesAttr);
7456
7457 if (OMPBuilder.Config.isTargetDevice()) {
7458 Value *ExecMode =
7459 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7460 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7461 }
7462
7463 // Save insert point.
7464 IRBuilder<>::InsertPointGuard IPG(Builder);
7465 // We will generate the entries in the outlined function but the debug
7466 // location may still be pointing to the parent function. Reset it now.
7467 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7468
7469 // Generate the region into the function.
7470 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7471 Builder.SetInsertPoint(EntryBB);
7472
7473 // Insert target init call in the device compilation pass.
7474 if (OMPBuilder.Config.isTargetDevice())
7475 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7476
7477 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7478
7479 // As we embed the user code in the middle of our target region after we
7480 // generate entry code, we must move what allocas we can into the entry
7481 // block to avoid possibly breaking optimisations for the device.
7482 if (OMPBuilder.Config.isTargetDevice())
7483 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7484
7485 // Insert target deinit call in the device compilation pass.
7486 BasicBlock *OutlinedBodyBB =
7487 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7488 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7489 Builder.saveIP(),
7490 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7491 if (!AfterIP)
7492 return AfterIP.takeError();
7493 Builder.restoreIP(*AfterIP);
7494 if (OMPBuilder.Config.isTargetDevice())
7495 OMPBuilder.createTargetDeinit(Builder);
7496
7497 // Insert return instruction.
7498 Builder.CreateRetVoid();
7499
7500 // New Alloca IP at entry point of created device function.
7501 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7502 auto AllocaIP = Builder.saveIP();
7503
7504 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7505
7506 // Skip the artificial dyn_ptr on the device.
7507 const auto &ArgRange =
7508 OMPBuilder.Config.isTargetDevice()
7509 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7510 : Func->args();
7511
7513
7514 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7515 // Things like GEPs can come in the form of Constants. Constants and
7516 // ConstantExprs do not know what they are contained in, so we must dig a
7517 // little to find an instruction that tells us whether they are used inside
7518 // of the function we are outlining. We also replace the original constant
7519 // expression with an equivalent new instruction: an instruction allows
7520 // easy modification in the following loop, since we then know that the
7521 // instruction is owned by our target function and replaceUsesOfWith can
7522 // be invoked on it (this cannot be done with constants). A brand new
7523 // instruction also lets us be cautious, as it is perhaps possible that
7524 // the old expression was used inside of the function but also exists
7525 // and is used externally
7526 // (unlikely by the nature of a Constant, but still possible).
7527 // NOTE: We cannot remove dead constants that have been rewritten to
7528 // instructions at this stage, we run the risk of breaking later lowering
7529 // by doing so as we could still be in the process of lowering the module
7530 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7531 // constants we have created rewritten versions of.
7532 if (auto *Const = dyn_cast<Constant>(Input))
7533 convertUsersOfConstantsToInstructions(Const, Func, false);
7534
7535 // Collect users before iterating over them to avoid invalidating the
7536 // iteration in case a user uses Input more than once (e.g. a call
7537 // instruction).
7538 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7539 // Collect all the instructions
7540 for (User *User : make_early_inc_range(Users))
7541 if (auto *Instr = dyn_cast<Instruction>(User))
7542 if (Instr->getFunction() == Func)
7543 Instr->replaceUsesOfWith(Input, InputCopy);
7544 };
7545
7546 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7547
7548 // Rewrite uses of input values to parameters.
7549 for (auto InArg : zip(Inputs, ArgRange)) {
7550 Value *Input = std::get<0>(InArg);
7551 Argument &Arg = std::get<1>(InArg);
7552 Value *InputCopy = nullptr;
7553
7554 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7555 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7556 if (!AfterIP)
7557 return AfterIP.takeError();
7558 Builder.restoreIP(*AfterIP);
7559 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7560
7561 // In certain cases a Global may be set up for replacement; however, this
7562 // Global may be used in multiple arguments to the kernel, just segmented
7563 // apart. For example, if we have a global array that is sectioned into
7564 // multiple mappings (technically not legal in OpenMP, but necessary for
7565 // Fortran Common Blocks), we will end up with GEPs into this array inside
7566 // the kernel that refer to the Global but are, for all intents and
7567 // purposes, separate arguments to the kernel. If we have mapped a segment
7568 // that requires a GEP into the 0-th index, it will fold into a direct
7569 // reference to the Global. If we then encounter this folded GEP during
7570 // replacement, all of the references to the Global in the kernel will be
7571 // replaced with the argument we have generated that corresponds to it,
7572 // including any other GEPs that refer to the Global and that may be other
7573 // arguments. This would invalidate all of the other preceding mapped
7574 // arguments that refer to the same Global but are separate segments. To
7575 // prevent this, we defer the processing of Globals until all other
7576 // replacements have been performed.
7577 if (isa<GlobalValue>(Input)) {
7578 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7579 continue;
7580 }
7581
7582 if (isa<ConstantData>(Input))
7583 continue;
7584
7585 ReplaceValue(Input, InputCopy, Func);
7586 }
7587
7588 // Replace all of our deferred Input values, currently just Globals.
7589 for (auto Deferred : DeferredReplacement)
7590 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7591
7592 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7593 ValueReplacementMap);
7594 return Func;
7595}
7596/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7597/// of pointers containing shared data between the parent task and the created
7598/// task.
7599static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7600 IRBuilderBase &Builder,
7601 Value *TaskWithPrivates,
7602 Type *TaskWithPrivatesTy) {
7603
7604 Type *TaskTy = OMPIRBuilder.Task;
7605 LLVMContext &Ctx = Builder.getContext();
7606 Value *TaskT =
7607 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7608 Value *Shareds = TaskT;
7609 // TaskWithPrivatesTy can be one of the following
7610 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7611 // %struct.privates }
7612 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7613 //
7614 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7615 // its first member has to be the task descriptor. TaskTy is the type of the
7616 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7617 // first member of TaskT, gives us the pointer to shared data.
7618 if (TaskWithPrivatesTy != TaskTy)
7619 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7620 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7621}
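// As a sketch, for the two-member task_with_privates layout the helper above
// emits IR of the form (value names are illustrative only):
//
//   %task = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %shareds.gep = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task,
//                                i32 0, i32 0
//   %shareds = load ptr, ptr %shareds.gep
//
// In the plain kmp_task_ompbuilder_t case the first GEP is a no-op and only
// the load of the leading shareds pointer remains.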
7622/// Create an entry point for a target task, i.e. a function with the
7623/// signature
7624/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7625/// This function is called from emitTargetTask once the code to launch
7626/// the target kernel has been outlined already.
7627/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7628/// into the task structure so that the deferred target task can access this
7629/// data even after the stack frame of the generating task has been rolled
7630/// back. Offloading arrays contain base pointers, pointers, sizes etc
7631/// of the data that the target kernel will access. These in effect are the
7632/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7633static Function *emitTargetTaskProxyFunction(
7634 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7635 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7636 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7637
7638 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7639 // This is because PrivatesTy is the type of the structure in which
7640 // we pass the offloading arrays to the deferred target task.
7641 assert((!NumOffloadingArrays || PrivatesTy) &&
7642 "PrivatesTy cannot be nullptr when there are offloading arrays "
7643 "to privatize");
7644
7645 Module &M = OMPBuilder.M;
7646 // KernelLaunchFunction is the target launch function, i.e.
7647 // the function that sets up kernel arguments and calls
7648 // __tgt_target_kernel to launch the kernel on the device.
7649 //
7650 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7651
7652 // StaleCI is the CallInst which is the call to the outlined
7653 // target kernel launch function. If there are local live-in values
7654 // that the outlined function uses then these are aggregated into a structure
7655 // which is passed as the second argument. If there are no local live-in
7656 // values or if all values used by the outlined kernel are global variables,
7657 // then there's only one argument, the threadID. So, StaleCI can be
7658 //
7659 // %structArg = alloca { ptr, ptr }, align 8
7660 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7661 // store ptr %20, ptr %gep_, align 8
7662 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7663 // store ptr %21, ptr %gep_8, align 8
7664 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7665 //
7666 // OR
7667 //
7668 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7669 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7670 StaleCI->getIterator());
7671
7672 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7673
7674 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7675 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7676 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7677
7678 auto ProxyFnTy =
7679 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7680 /* isVarArg */ false);
7681 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7682 ".omp_target_task_proxy_func",
7683 Builder.GetInsertBlock()->getModule());
7684 Value *ThreadId = ProxyFn->getArg(0);
7685 Value *TaskWithPrivates = ProxyFn->getArg(1);
7686 ThreadId->setName("thread.id");
7687 TaskWithPrivates->setName("task");
7688
7689 bool HasShareds = SharedArgsOperandNo > 0;
7690 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7691 BasicBlock *EntryBB =
7692 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7693 Builder.SetInsertPoint(EntryBB);
7694
7695 SmallVector<Value *> KernelLaunchArgs;
7696 KernelLaunchArgs.reserve(StaleCI->arg_size());
7697 KernelLaunchArgs.push_back(ThreadId);
7698
7699 if (HasOffloadingArrays) {
7700 assert(TaskTy != TaskWithPrivatesTy &&
7701 "If there are offloading arrays to pass to the target, "
7702 "TaskTy cannot be the same as TaskWithPrivatesTy");
7703 (void)TaskTy;
7704 Value *Privates =
7705 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7706 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7707 KernelLaunchArgs.push_back(
7708 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7709 }
7710
7711 if (HasShareds) {
7712 auto *ArgStructAlloca =
7713 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7714 assert(ArgStructAlloca &&
7715 "Unable to find the alloca instruction corresponding to arguments "
7716 "for extracted function");
7717 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7718
7719 AllocaInst *NewArgStructAlloca =
7720 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7721
7722 Value *SharedsSize =
7723 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7724
7726 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7727
7728 Builder.CreateMemCpy(
7729 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7730 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7731 KernelLaunchArgs.push_back(NewArgStructAlloca);
7732 }
7733 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7734 Builder.CreateRetVoid();
7735 return ProxyFn;
7736}
7737static Type *getOffloadingArrayType(Value *V) {
7738
7739 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7740 return GEP->getSourceElementType();
7741 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7742 return Alloca->getAllocatedType();
7743
7744 llvm_unreachable("Unhandled Instruction type");
7745 return nullptr;
7746}
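// E.g. for "%.offload_baseptrs = alloca [3 x ptr]" this returns [3 x ptr];
// for a GEP whose source element type is [3 x i64] it returns [3 x i64].
// Allocas and GEPs are the only forms in which offloading arrays are
// expected to reach this helper.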
7747// This function returns a struct that has at most two members.
7748// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7749// descriptor. The second member, if needed, is a struct containing arrays
7750// that need to be passed to the offloaded target kernel. For example,
7751// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7752// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7753// respectively, then the types created by this function are
7754//
7755// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7756// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7757// %struct.privates }
7758// %struct.task_with_privates is returned by this function.
7759// If there aren't any offloading arrays to pass to the target kernel,
7760// %struct.kmp_task_ompbuilder_t is returned.
7761static StructType *
7762createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7763 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7764
7765 if (OffloadingArraysToPrivatize.empty())
7766 return OMPIRBuilder.Task;
7767
7768 SmallVector<Type *, 4> StructFieldTypes;
7769 for (Value *V : OffloadingArraysToPrivatize) {
7770 assert(V->getType()->isPointerTy() &&
7771 "Expected pointer to array to privatize. Got a non-pointer value "
7772 "instead");
7773 Type *ArrayTy = getOffloadingArrayType(V);
7774 assert(ArrayTy && "ArrayType cannot be nullptr");
7775 StructFieldTypes.push_back(ArrayTy);
7776 }
7777 StructType *PrivatesStructTy =
7778 StructType::create(StructFieldTypes, "struct.privates");
7779 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7780 "struct.task_with_privates");
7781}
7782static Error emitTargetOutlinedFunction(
7783 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7784 TargetRegionEntryInfo &EntryInfo,
7785 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7786 Function *&OutlinedFn, Constant *&OutlinedFnID,
7787 SmallVectorImpl<Value *> &Inputs,
7788 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7789 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7790
7791 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7792 [&](StringRef EntryFnName) {
7793 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7794 EntryFnName, Inputs, CBFunc,
7795 ArgAccessorFuncCB);
7796 };
7797
7798 return OMPBuilder.emitTargetRegionFunction(
7799 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7800 OutlinedFnID);
7801}
7802
7803OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7804 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7805 OpenMPIRBuilder::InsertPointTy AllocaIP,
7806 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7807 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7808
7809 // The following explains the code-gen scenario for the `target` directive. A
7810 // similar scenario is followed for other device-related directives (e.g.
7811 // `target enter data`), since we only need to emit a task that encapsulates
7812 // the proper runtime call.
7813 //
7814 // When we arrive at this function, the target region itself has been
7815 // outlined into the function OutlinedFn.
7816 // So at this point, for
7817 // --------------------------------------------------------------
7818 // void user_code_that_offloads(...) {
7819 // omp target depend(..) map(from:a) map(to:b) private(i)
7820 // do i = 1, 10
7821 // a(i) = b(i) + n
7822 // }
7823 //
7824 // --------------------------------------------------------------
7825 //
7826 // we have
7827 //
7828 // --------------------------------------------------------------
7829 //
7830 // void user_code_that_offloads(...) {
7831 // %.offload_baseptrs = alloca [2 x ptr], align 8
7832 // %.offload_ptrs = alloca [2 x ptr], align 8
7833 // %.offload_mappers = alloca [2 x ptr], align 8
7834 // ;; target region has been outlined and now we need to
7835 // ;; offload to it via a target task.
7836 // }
7837 // void outlined_device_function(ptr a, ptr b, ptr n) {
7838 // n = *n_ptr;
7839 // do i = 1, 10
7840 // a(i) = b(i) + n
7841 // }
7842 //
7843 // We have to now do the following
7844 // (i) Make an offloading call to outlined_device_function using the OpenMP
7845 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7846 // emitted by emitKernelLaunch
7847 // (ii) Create a task entry point function that calls kernel_launch_function
7848 // and is the entry point for the target task. See
7849 // '@.omp_target_task_proxy_func in the pseudocode below.
7850 // (iii) Create a task with the task entry point created in (ii)
7851 //
7852 // That is we create the following
7853 // struct task_with_privates {
7854 // struct kmp_task_ompbuilder_t task_struct;
7855 // struct privates {
7856 // [2 x ptr] ; baseptrs
7857 // [2 x ptr] ; ptrs
7858 // [2 x i64] ; sizes
7859 // }
7860 // }
7861 // void user_code_that_offloads(...) {
7862 // %.offload_baseptrs = alloca [2 x ptr], align 8
7863 // %.offload_ptrs = alloca [2 x ptr], align 8
7864 // %.offload_sizes = alloca [2 x i64], align 8
7865 //
7866 // %structArg = alloca { ptr, ptr, ptr }, align 8
7867 // %structArg[0] = a
7868 // %structArg[1] = b
7869 // %structArg[2] = &n
7870 //
7871 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7872 // sizeof(kmp_task_ompbuilder_t),
7873 // sizeof(structArg),
7874 // @.omp_target_task_proxy_func,
7875 // ...)
7876 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7877 // sizeof(structArg))
7878 // memcpy(target_task_with_privates->privates->baseptrs,
7879 // offload_baseptrs, sizeof(offload_baseptrs)
7880 // memcpy(target_task_with_privates->privates->ptrs,
7881 // offload_ptrs, sizeof(offload_ptrs)
7882 // memcpy(target_task_with_privates->privates->sizes,
7883 // offload_sizes, sizeof(offload_sizes)
7884 // dependencies_array = ...
7885 // ;; if nowait not present
7886 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7887 // call @__kmpc_omp_task_begin_if0(...)
7888 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
7889 // %target_task_with_privates)
7890 // call @__kmpc_omp_task_complete_if0(...)
7891 // }
7892 //
7893 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7894 // ptr %task) {
7895 // %structArg = alloca {ptr, ptr, ptr}
7896 // %task_ptr = getelementptr(%task, 0, 0)
7897 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7898 // memcpy(%structArg, %shared_data, sizeof(%structArg))
7899 //
7900 // %offloading_arrays = getelementptr(%task, 0, 1)
7901 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7902 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7903 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7904 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7905 // %offload_sizes, %structArg)
7906 // }
7907 //
7908 // We need the proxy function because the signature of the task entry point
7909 // expected by kmpc_omp_task is always the same and will be different from
7910 // that of the kernel_launch function.
7911 //
7912 // kernel_launch_function is generated by emitKernelLaunch and has the
7913 // always_inline attribute. For this example, it'll look like so:
7914 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7915 // %offload_sizes, %structArg) alwaysinline {
7916 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7917 // ; load aggregated data from %structArg
7918 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7919 // ; offload_sizes
7920 // call i32 @__tgt_target_kernel(...,
7921 // outlined_device_function,
7922 // ptr %kernel_args)
7923 // }
7924 // void outlined_device_function(ptr a, ptr b, ptr n) {
7925 // n = *n_ptr;
7926 // do i = 1, 10
7927 // a(i) = b(i) + n
7928 // }
7929 //
7930 BasicBlock *TargetTaskBodyBB =
7931 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7932 BasicBlock *TargetTaskAllocaBB =
7933 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7934
7935 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7936 TargetTaskAllocaBB->begin());
7937 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7938
7939 OutlineInfo OI;
7940 OI.EntryBB = TargetTaskAllocaBB;
7941 OI.OuterAllocaBB = AllocaIP.getBlock();
7942
7943 // Add the thread ID argument.
7944 SmallVector<Instruction *, 4> ToBeDeleted;
7945 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7946 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7947
7948 // Generate the task body which will subsequently be outlined.
7949 Builder.restoreIP(TargetTaskBodyIP);
7950 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7951 return Err;
7952
7953 // The outliner (CodeExtractor) extracts a sequence or vector of blocks
7954 // that it is given. These blocks are enumerated by
7955 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7956 // to be outside the region. In other words, OI.ExitBlock is expected to be
7957 // the start of the region after the outlining. We used to set OI.ExitBlock
7958 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7959 // except when the task body is a single basic block. In that case,
7960 // OI.ExitBlock is set to the single task body block and will get left out of
7961 // the outlining process. So, simply create a new empty block to which we
7962 // unconditionally branch from where TaskBodyCB left off.
7963 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7964 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7965 /*IsFinished=*/true);
7966
7967 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7968 bool NeedsTargetTask = HasNoWait && DeviceID;
7969 if (NeedsTargetTask) {
7970 for (auto *V :
7971 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7972 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7973 RTArgs.SizesArray}) {
7974 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
7975 OffloadingArraysToPrivatize.push_back(V);
7976 OI.ExcludeArgsFromAggregate.push_back(V);
7977 }
7978 }
7979 }
7980 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7981 DeviceID, OffloadingArraysToPrivatize](
7982 Function &OutlinedFn) mutable {
7983 assert(OutlinedFn.hasOneUse() &&
7984 "there must be a single user for the outlined function");
7985
7986 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7987
7988 // The first argument of StaleCI is always the thread id.
7989 // The next few arguments are the pointers to offloading arrays,
7990 // if any (see OffloadingArraysToPrivatize).
7991 // Finally, all other local values that are live-in to the outlined region
7992 // end up in a structure whose pointer is passed as the last argument. This
7993 // piece of data is passed in the "shared" field of the task structure. So,
7994 // we know we have to pass shareds to the task if the number of arguments is
7995 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
7996 // thread id. Further, for safety, we assert that the number of arguments of
7997 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
7998 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
7999 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8000 assert((!HasShareds ||
8001 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8002 "Wrong number of arguments for StaleCI when shareds are present");
8003 int SharedArgOperandNo =
8004 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8005
8006 StructType *TaskWithPrivatesTy =
8007 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8008 StructType *PrivatesTy = nullptr;
8009
8010 if (!OffloadingArraysToPrivatize.empty())
8011 PrivatesTy =
8012 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8013
8014 Function *ProxyFn = emitTargetTaskProxyFunction(
8015 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8016 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8017
8018 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8019 << "\n");
8020
8021 Builder.SetInsertPoint(StaleCI);
8022
8023 // Gather the arguments for emitting the runtime call.
8024 uint32_t SrcLocStrSize;
8025 Constant *SrcLocStr =
8026 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8027 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8028
8029 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8030 //
8031 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc, both to
8032 // provide the DeviceID to the deferred task and because
8033 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8034 Function *TaskAllocFn =
8035 !NeedsTargetTask
8036 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8037 : getOrCreateRuntimeFunctionPtr(
8038 OMPRTL___kmpc_omp_target_task_alloc);
8039
8040 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
8041 // call.
8042 Value *ThreadID = getOrCreateThreadID(Ident);
8043
8044 // Argument - `sizeof_kmp_task_t` (TaskSize)
8045 // TaskSize refers to the size in bytes of the kmp_task_t data structure
8046 // plus any other data to be passed to the target task, if any, which
8047 // is packed into a struct. kmp_task_t and the struct so created are
8048 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8049 Value *TaskSize = Builder.getInt64(
8050 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8051
8052 // Argument - `sizeof_shareds` (SharedsSize)
8053 // SharedsSize refers to the shareds array size in the kmp_task_t data
8054 // structure.
8055 Value *SharedsSize = Builder.getInt64(0);
8056 if (HasShareds) {
8057 auto *ArgStructAlloca =
8058 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8059 assert(ArgStructAlloca &&
8060 "Unable to find the alloca instruction corresponding to arguments "
8061 "for extracted function");
8062 auto *ArgStructType =
8063 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8064 assert(ArgStructType && "Unable to find struct type corresponding to "
8065 "arguments for extracted function");
8066 SharedsSize =
8067 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8068 }
8069
8070 // Argument - `flags`
8071 // Task is tied iff (Flags & 1) == 1.
8072 // Task is untied iff (Flags & 1) == 0.
8073 // Task is final iff (Flags & 2) == 2.
8074 // Task is not final iff (Flags & 2) == 0.
8075 // A target task is not final and is untied.
8076 Value *Flags = Builder.getInt32(0);
8077
8078 // Emit the @__kmpc_omp_task_alloc runtime call
8079 // The runtime call returns a pointer to an area where the task captured
8080 // variables must be copied before the task is run (TaskData)
8081 CallInst *TaskData = nullptr;
8082
8083 SmallVector<llvm::Value *> TaskAllocArgs = {
8084 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8085 /*flags=*/Flags,
8086 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8087 /*task_func=*/ProxyFn};
8088
8089 if (NeedsTargetTask) {
8090 assert(DeviceID && "Expected non-empty device ID.");
8091 TaskAllocArgs.push_back(DeviceID);
8092 }
8093
8094 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8095
8096 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8097 if (HasShareds) {
8098 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8099 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8100 *this, Builder, TaskData, TaskWithPrivatesTy);
8101 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8102 SharedsSize);
8103 }
8104 if (!OffloadingArraysToPrivatize.empty()) {
8105 Value *Privates =
8106 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8107 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8108 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8109 [[maybe_unused]] Type *ArrayType =
8110 getOffloadingArrayType(PtrToPrivatize);
8111 assert(ArrayType && "ArrayType cannot be nullptr");
8112
8113 Type *ElementType = PrivatesTy->getElementType(i);
8114 assert(ElementType == ArrayType &&
8115 "ElementType should match ArrayType");
8116 (void)ArrayType;
8117
8118 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8119 Builder.CreateMemCpy(
8120 Dst, Alignment, PtrToPrivatize, Alignment,
8121 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8122 }
8123 }
8124
8125 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8126
8127 // ---------------------------------------------------------------
8128 // V5.2 13.8 target construct
8129 // If the nowait clause is present, execution of the target task
8130 // may be deferred. If the nowait clause is not present, the target task is
8131 // an included task.
8132 // ---------------------------------------------------------------
8133 // The above means that the lack of a nowait on the target construct
8134 // translates to '#pragma omp task if(0)'
8135 if (!NeedsTargetTask) {
8136 if (DepArray) {
8137 Function *TaskWaitFn =
8138 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8139 Builder.CreateCall(
8140 TaskWaitFn,
8141 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8142 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8143 /*dep_list=*/DepArray,
8144 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8145 /*noalias_dep_list=*/
8146 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8147 }
8148 // Included task.
8149 Function *TaskBeginFn =
8150 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8151 Function *TaskCompleteFn =
8152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8153 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8154 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8155 CI->setDebugLoc(StaleCI->getDebugLoc());
8156 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8157 } else if (DepArray) {
8158 // HasNoWait - meaning the task may be deferred. Call
8159 // __kmpc_omp_task_with_deps if there are dependencies,
8160 // else call __kmpc_omp_task
8161 Function *TaskFn =
8162 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8163 Builder.CreateCall(
8164 TaskFn,
8165 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8166 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8167 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8168 } else {
8169 // Emit the @__kmpc_omp_task runtime call to spawn the task
8170 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8171 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8172 }
8173
8174 StaleCI->eraseFromParent();
8175 for (Instruction *I : llvm::reverse(ToBeDeleted))
8176 I->eraseFromParent();
8177 };
8178 addOutlineInfo(std::move(OI));
8179
8180 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8181 << *(Builder.GetInsertBlock()) << "\n");
8182 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8183 << *(Builder.GetInsertBlock()->getParent()->getParent())
8184 << "\n");
8185 return Builder.saveIP();
8186}
8187
8188Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8189 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8190 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8191 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8192 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8193 if (Error Err =
8194 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8195 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8196 return Err;
8197 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8198 return Error::success();
8199}
8200
8201static void emitTargetCall(
8202 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8203 OpenMPIRBuilder::InsertPointTy AllocaIP,
8204 OpenMPIRBuilder::TargetDataInfo &Info,
8205 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8206 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8207 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8208 SmallVectorImpl<Value *> &Args,
8209 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8210 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8211 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies,
8212 bool HasNoWait) {
8213 // Generate a function call to the host fallback implementation of the target
8214 // region. This is called by the host when no offload entry was generated for
8215 // the target region and when the offloading call fails at runtime.
8216 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8217 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8218 Builder.restoreIP(IP);
8219 Builder.CreateCall(OutlinedFn, Args);
8220 return Builder.saveIP();
8221 };
8222
8223 bool HasDependencies = Dependencies.size() > 0;
8224 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8225
8226 OpenMPIRBuilder::TargetKernelArgs KArgs;
8227
8228 auto TaskBodyCB =
8229 [&](Value *DeviceID, Value *RTLoc,
8230 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8231 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8232 // produce any.
8233 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8234 // emitKernelLaunch makes the necessary runtime call to offload the
8235 // kernel. We then outline all that code into a separate function
8236 // ('kernel_launch_function' in the pseudo code above). This function is
8237 // then called by the target task proxy function (see
8238 // '@.omp_target_task_proxy_func' in the pseudo code above)
8239 // "@.omp_target_task_proxy_func' is generated by
8240 // emitTargetTaskProxyFunction.
8241 if (OutlinedFnID && DeviceID)
8242 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8243 EmitTargetCallFallbackCB, KArgs,
8244 DeviceID, RTLoc, TargetTaskAllocaIP);
8245
8246 // We only need to do the outlining if `DeviceID` is set to avoid calling
8247 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8248 // generating the `else` branch of an `if` clause.
8249 //
8250 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8251 // In this case, we execute the host implementation directly.
8252 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8253 }());
8254
8255 OMPBuilder.Builder.restoreIP(AfterIP);
8256 return Error::success();
8257 };
8258
8259 auto &&EmitTargetCallElse =
8260 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8261 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8262 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8263 // produce any.
8264 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8265 if (RequiresOuterTargetTask) {
8266 // Arguments that are intended to be directly forwarded to an
8267 // emitKernelLaunch call are passed as nullptr, since
8268 // OutlinedFnID=nullptr results in that call not being done.
8269 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8270 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8271 /*RTLoc=*/nullptr, AllocaIP,
8272 Dependencies, EmptyRTArgs, HasNoWait);
8273 }
8274 return EmitTargetCallFallbackCB(Builder.saveIP());
8275 }());
8276
8277 Builder.restoreIP(AfterIP);
8278 return Error::success();
8279 };
8280
8281 auto &&EmitTargetCallThen =
8282 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8283 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8284 Info.HasNoWait = HasNoWait;
8285 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8286 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8287 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8288 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8289 /*IsNonContiguous=*/true,
8290 /*ForEndCall=*/false))
8291 return Err;
8292
8293 SmallVector<Value *, 3> NumTeamsC;
8294 for (auto [DefaultVal, RuntimeVal] :
8295 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8296 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8297 : Builder.getInt32(DefaultVal));
8298
8299 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8300 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8301 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8302 if (Clause)
8303 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8304 /*isSigned=*/false);
8305 return Clause;
8306 };
8307 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8308 if (Clause)
8309 Result =
8310 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8311 Result, Clause)
8312 : Clause;
8313 };
8314
8315 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8316 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8317 SmallVector<Value *, 3> NumThreadsC;
8318 Value *MaxThreadsClause =
8319 RuntimeAttrs.TeamsThreadLimit.size() == 1
8320 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8321 : nullptr;
8322
8323 for (auto [TeamsVal, TargetVal] : zip_equal(
8324 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8325 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8326 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8327
8328 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8329 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8330
8331 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8332 }
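// Worked example with hypothetical clause values: for NUM_THREADS(128) and
// THREAD_LIMIT(64), the lambdas above compute
// select(icmp ult (i32 128, i32 64), i32 128, i32 64) = 64, i.e. the
// smaller of the two clauses wins; with neither clause present the entry
// stays 0, which tells the runtime to pick its own default.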
8333
8334 unsigned NumTargetItems = Info.NumberOfPtrs;
8335 // TODO: Use correct device ID
8336 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8337 uint32_t SrcLocStrSize;
8338 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8339 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8340 llvm::omp::IdentFlag(0), 0);
8341
8342 Value *TripCount = RuntimeAttrs.LoopTripCount
8343 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8344 Builder.getInt64Ty(),
8345 /*isSigned=*/false)
8346 : Builder.getInt64(0);
8347
8348 // TODO: Use correct DynCGGroupMem
8349 Value *DynCGGroupMem = Builder.getInt32(0);
8350
8351 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8352 NumTeamsC, NumThreadsC,
8353 DynCGGroupMem, HasNoWait);
8354
8355 // Assume no error was returned because TaskBodyCB and
8356 // EmitTargetCallFallbackCB don't produce any.
8357 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8358 // The presence of certain clauses on the target directive requires the
8359 // explicit generation of the target task.
8360 if (RequiresOuterTargetTask)
8361 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8362 Dependencies, KArgs.RTArgs,
8363 Info.HasNoWait);
8364
8365 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8366 EmitTargetCallFallbackCB, KArgs,
8367 DeviceID, RTLoc, AllocaIP);
8368 }());
8369
8370 Builder.restoreIP(AfterIP);
8371 return Error::success();
8372 };
8373
8374 // If we don't have an ID for the target region, it means an offload entry
8375 // wasn't created. In this case we just run the host fallback directly and
8376 // ignore any potential 'if' clauses.
8377 if (!OutlinedFnID) {
8378 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8379 return;
8380 }
8381
8382 // If there's no 'if' clause, only generate the kernel launch code path.
8383 if (!IfCond) {
8384 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8385 return;
8386 }
8387
8388 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8389 EmitTargetCallElse, AllocaIP));
8390}
8391
8392OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8393 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8394 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8395 TargetRegionEntryInfo &EntryInfo,
8396 const TargetKernelDefaultAttrs &DefaultAttrs,
8397 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8398 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8399 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8400 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8401 CustomMapperCallbackTy CustomMapperCB,
8402 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8403
8404 if (!updateToLocation(Loc))
8405 return InsertPointTy();
8406
8407 Builder.restoreIP(CodeGenIP);
8408
8409 Function *OutlinedFn;
8410 Constant *OutlinedFnID = nullptr;
8411 // The target region is outlined into its own function. The LLVM IR for
8412 // the target region itself is generated using the callbacks CBFunc
8413 // and ArgAccessorFuncCB.
8414 if (Error Err = emitTargetOutlinedFunction(
8415 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8416 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8417 return Err;
8418
8419 // If we are not on the target device, then we need to generate code
8420 // to make a remote call (offload) to the previously outlined function
8421 // that represents the target region. Do that now.
8422 if (!Config.isTargetDevice())
8423 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8424 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8425 CustomMapperCB, Dependencies, HasNowait);
8426 return Builder.saveIP();
8427}
8428
8429std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8430 StringRef FirstSeparator,
8431 StringRef Separator) {
8432 SmallString<128> Buffer;
8433 llvm::raw_svector_ostream OS(Buffer);
8434 StringRef Sep = FirstSeparator;
8435 for (StringRef Part : Parts) {
8436 OS << Sep << Part;
8437 Sep = Separator;
8438 }
8439 return OS.str().str();
8440}
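// For example, getNameWithSeparators({"a", "b"}, "__", ".") produces
// "__a.b": the first separator prefixes the leading part and the regular
// separator joins every subsequent part.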
8441
8442std::string
8443OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8444 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8445 Config.separator());
8446}
8447
8448GlobalVariable *
8449OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
8450 unsigned AddressSpace) {
8451 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8452 if (Elem.second) {
8453 assert(Elem.second->getValueType() == Ty &&
8454 "OMP internal variable has different type than requested");
8455 } else {
8456 // TODO: investigate the appropriate linkage type used for the global
8457 // variable for possibly changing that to internal or private, or maybe
8458 // create different versions of the function for different OMP internal
8459 // variables.
8460 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8461 ? GlobalValue::InternalLinkage
8462 : GlobalValue::CommonLinkage;
8463 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8464 Constant::getNullValue(Ty), Elem.first(),
8465 /*InsertBefore=*/nullptr,
8466 GlobalValue::NotThreadLocal, AddressSpace);
8467 const DataLayout &DL = M.getDataLayout();
8468 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8469 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
8470 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8471 Elem.second = GV;
8472 }
8473
8474 return Elem.second;
8475}
8476
8477Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8478 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8479 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8480 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8481}
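// Example (sketch, assuming "." is used for both separators): a critical
// region named "foo" is guarded by an internal KmpCriticalNameTy variable
// called ".gomp_critical_user_foo.var".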
8482
8483Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8484 LLVMContext &Ctx = Builder.getContext();
8485 Value *Null =
8486 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8487 Value *SizeGep =
8488 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8489 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8490 return SizePtrToInt;
8491}
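// This is the usual "GEP from null" sizeof idiom; the emitted IR is roughly:
//   %SizeGep      = getelementptr <BasePtr's type>, ptr null, i32 1
//   %SizePtrToInt = ptrtoint ptr %SizeGep to i64
// so the byte size is obtained without hard-coding any target constants.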
8492
8493GlobalVariable *
8494OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8495 std::string VarName) {
8496 llvm::Constant *MaptypesArrayInit =
8497 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8498 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8499 M, MaptypesArrayInit->getType(),
8500 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8501 VarName);
8502 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8503 return MaptypesArrayGlobal;
8504}
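// Sketch of the output for a hypothetical Mappings = {0x3}
// (OMP_MAP_TO | OMP_MAP_FROM) with VarName ".offload_maptypes":
//   @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 3]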
8505
8506void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8507 InsertPointTy AllocaIP,
8508 unsigned NumOperands,
8509 struct MapperAllocas &MapperAllocas) {
8510 if (!updateToLocation(Loc))
8511 return;
8512
8513 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8514 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8515 Builder.restoreIP(AllocaIP);
8516 AllocaInst *ArgsBase = Builder.CreateAlloca(
8517 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8518 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8519 ".offload_ptrs");
8520 AllocaInst *ArgSizes = Builder.CreateAlloca(
8521 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8522 updateToLocation(Loc);
8523 MapperAllocas.ArgsBase = ArgsBase;
8524 MapperAllocas.Args = Args;
8525 MapperAllocas.ArgSizes = ArgSizes;
8526}
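// Sketch of the storage reserved for NumOperands == 2:
//   %.offload_baseptrs = alloca [2 x ptr]
//   %.offload_ptrs     = alloca [2 x ptr]
//   %.offload_sizes    = alloca [2 x i64]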
8527
8528void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8529 Function *MapperFunc, Value *SrcLocInfo,
8530 Value *MaptypesArg, Value *MapnamesArg,
8531 struct MapperAllocas &MapperAllocas,
8532 int64_t DeviceID, unsigned NumOperands) {
8533 if (!updateToLocation(Loc))
8534 return;
8535
8536 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8537 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8538 Value *ArgsBaseGEP =
8539 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8540 {Builder.getInt32(0), Builder.getInt32(0)});
8541 Value *ArgsGEP =
8542 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8543 {Builder.getInt32(0), Builder.getInt32(0)});
8544 Value *ArgSizesGEP =
8545 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8546 {Builder.getInt32(0), Builder.getInt32(0)});
8547 Value *NullPtr =
8548 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8549 Builder.CreateCall(MapperFunc,
8550 {SrcLocInfo, Builder.getInt64(DeviceID),
8551 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8552 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8553}
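// Sketch of the resulting call for NumOperands == 2; the callee is whatever
// MapperFunc the caller supplies (e.g. a __tgt_target_data_begin_mapper-style
// entry point), and the three arrays decay to pointers to their first element:
//   call void @<MapperFunc>(ptr %srcloc, i64 <DeviceID>, i32 2,
//       ptr %.offload_baseptrs, ptr %.offload_ptrs, ptr %.offload_sizes,
//       ptr <maptypes>, ptr <mapnames>, ptr null)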
8554
8555void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8556 TargetDataRTArgs &RTArgs,
8557 TargetDataInfo &Info,
8558 bool ForEndCall) {
8559 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8560 "expected region end call to runtime only when end call is separate");
8561 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8562 auto VoidPtrTy = UnqualPtrTy;
8563 auto VoidPtrPtrTy = UnqualPtrTy;
8564 auto Int64Ty = Type::getInt64Ty(M.getContext());
8565 auto Int64PtrTy = UnqualPtrTy;
8566
8567 if (!Info.NumberOfPtrs) {
8568 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8569 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8570 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8571 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8572 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8573 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8574 return;
8575 }
8576
8577 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8578 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8579 Info.RTArgs.BasePointersArray,
8580 /*Idx0=*/0, /*Idx1=*/0);
8581 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8582 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8583 /*Idx0=*/0,
8584 /*Idx1=*/0);
8585 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8586 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8587 /*Idx0=*/0, /*Idx1=*/0);
8588 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8589 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8590 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8591 : Info.RTArgs.MapTypesArray,
8592 /*Idx0=*/0,
8593 /*Idx1=*/0);
8594
8595 // Only emit the mapper information arrays if debug information is
8596 // requested.
8597 if (!Info.EmitDebug)
8598 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8599 else
8600 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8601 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8602 /*Idx0=*/0,
8603 /*Idx1=*/0);
8604 // If there is no user-defined mapper, set the mapper array to nullptr to
8605 // avoid an unnecessary data privatization.
8606 if (!Info.HasMapper)
8607 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8608 else
8609 RTArgs.MappersArray =
8610 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8611}
8612
8613void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8614 InsertPointTy CodeGenIP,
8615 MapInfosTy &CombinedInfo,
8616 TargetDataInfo &Info) {
8617 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8618 CombinedInfo.NonContigInfo;
8619
8620 // Build an array of struct descriptor_dim and then assign it to
8621 // offload_args.
8622 //
8623 // struct descriptor_dim {
8624 // uint64_t offset;
8625 // uint64_t count;
8626 // uint64_t stride
8627 // };
8628 Type *Int64Ty = Builder.getInt64Ty();
8629 StructType *DimTy = StructType::create(
8630 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8631 "struct.descriptor_dim");
8632
8633 enum { OffsetFD = 0, CountFD, StrideFD };
8634 // We need two index variables here since the size of "Dims" is the same as
8635 // the size of Components; however, the size of offset, count, and stride is
8636 // equal to the size of the base declaration that is non-contiguous.
8637 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8638 // Skip emitting ir if dimension size is 1 since it cannot be
8639 // non-contiguous.
8640 if (NonContigInfo.Dims[I] == 1)
8641 continue;
8642 Builder.restoreIP(AllocaIP);
8643 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8644 AllocaInst *DimsAddr =
8645 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8646 Builder.restoreIP(CodeGenIP);
8647 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8648 unsigned RevIdx = EE - II - 1;
8649 Value *DimsLVal = Builder.CreateInBoundsGEP(
8650 DimsAddr->getAllocatedType(), DimsAddr,
8651 {Builder.getInt64(0), Builder.getInt64(II)});
8652 // Offset
8653 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8654 Builder.CreateAlignedStore(
8655 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8656 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8657 // Count
8658 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8659 Builder.CreateAlignedStore(
8660 NonContigInfo.Counts[L][RevIdx], CountLVal,
8661 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8662 // Stride
8663 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8664 Builder.CreateAlignedStore(
8665 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8666 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8667 }
8668 // args[I] = &dims
8669 Builder.restoreIP(CodeGenIP);
8670 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8671 DimsAddr, Builder.getPtrTy());
8672 Value *P = Builder.CreateConstInBoundsGEP2_32(
8673 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8674 Info.RTArgs.PointersArray, 0, I);
8675 Builder.CreateAlignedStore(
8676 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8677 ++L;
8678 }
8679}
8680
8681void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8682 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8683 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8684 BasicBlock *ExitBB, bool IsInit) {
8685 StringRef Prefix = IsInit ? ".init" : ".del";
8686
8687 // Evaluate if this is an array section.
8688 BasicBlock *BodyBB = BasicBlock::Create(
8689 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8690 Value *IsArray =
8691 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8692 Value *DeleteBit = Builder.CreateAnd(
8693 MapType,
8694 Builder.getInt64(
8695 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8696 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8697 Value *DeleteCond;
8698 Value *Cond;
8699 if (IsInit) {
8700 // base != begin?
8701 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8702 // IsPtrAndObj?
8703 Value *PtrAndObjBit = Builder.CreateAnd(
8704 MapType,
8705 Builder.getInt64(
8706 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8707 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8708 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8709 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8710 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8711 DeleteCond = Builder.CreateIsNull(
8712 DeleteBit,
8713 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8714 } else {
8715 Cond = IsArray;
8716 DeleteCond = Builder.CreateIsNotNull(
8717 DeleteBit,
8718 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8719 }
8720 Cond = Builder.CreateAnd(Cond, DeleteCond);
8721 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8722
8723 emitBlock(BodyBB, MapperFn);
8724 // Get the array size by multiplying element size and element number (i.e., \p
8725 // Size).
8726 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8727 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8728 // memory allocation/deletion purpose only.
8729 Value *MapTypeArg = Builder.CreateAnd(
8730 MapType,
8731 Builder.getInt64(
8732 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8733 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8734 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8735 MapTypeArg = Builder.CreateOr(
8736 MapTypeArg,
8737 Builder.getInt64(
8738 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8739 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8740
8741 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8742 // data structure.
8743 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8744 ArraySize, MapTypeArg, MapName};
8745 Builder.CreateCall(
8746 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8747 OffloadingArgs);
8748}
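// The emitted guard and call are roughly equivalent to this pseudo-C sketch
// of the IsInit case:
//   if ((Size > 1 || (Base != Begin && (MapType & OMP_MAP_PTR_AND_OBJ))) &&
//       !(MapType & OMP_MAP_DELETE))
//     __tgt_push_mapper_component(Handle, Base, Begin, Size * ElementSize,
//         (MapType & ~(OMP_MAP_TO | OMP_MAP_FROM)) | OMP_MAP_IMPLICIT, Name);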
8749
8750Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8751 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8752 llvm::Value *BeginArg)>
8753 GenMapInfoCB,
8754 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8755 SmallVector<Type *> Params;
8756 Params.emplace_back(Builder.getPtrTy());
8757 Params.emplace_back(Builder.getPtrTy());
8758 Params.emplace_back(Builder.getPtrTy());
8759 Params.emplace_back(Builder.getInt64Ty());
8760 Params.emplace_back(Builder.getInt64Ty());
8761 Params.emplace_back(Builder.getPtrTy());
8762
8763 auto *FnTy =
8764 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8765
8766 SmallString<64> TyStr;
8767 raw_svector_ostream Out(TyStr);
8768 Function *MapperFn =
8769 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
8770 MapperFn->addFnAttr(Attribute::NoInline);
8771 MapperFn->addFnAttr(Attribute::NoUnwind);
8772 MapperFn->addParamAttr(0, Attribute::NoUndef);
8773 MapperFn->addParamAttr(1, Attribute::NoUndef);
8774 MapperFn->addParamAttr(2, Attribute::NoUndef);
8775 MapperFn->addParamAttr(3, Attribute::NoUndef);
8776 MapperFn->addParamAttr(4, Attribute::NoUndef);
8777 MapperFn->addParamAttr(5, Attribute::NoUndef);
8778
8779 // Start the mapper function code generation.
8780 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8781 auto SavedIP = Builder.saveIP();
8782 Builder.SetInsertPoint(EntryBB);
8783
8784 Value *MapperHandle = MapperFn->getArg(0);
8785 Value *BaseIn = MapperFn->getArg(1);
8786 Value *BeginIn = MapperFn->getArg(2);
8787 Value *Size = MapperFn->getArg(3);
8788 Value *MapType = MapperFn->getArg(4);
8789 Value *MapName = MapperFn->getArg(5);
8790
8791 // Compute the starting and end addresses of array elements.
8792 // Prepare common arguments for array initiation and deletion.
8793 // Convert the size in bytes into the number of array elements.
8794 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8795 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8796 Value *PtrBegin = BeginIn;
8797 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8798
8799 // Emit array initiation if this is an array section and \p MapType indicates
8800 // that memory allocation is required.
8801 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8802 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8803 MapType, MapName, ElementSize, HeadBB,
8804 /*IsInit=*/true);
8805
8806 // Emit a for loop to iterate through SizeArg elements and map all of them.
8807
8808 // Emit the loop header block.
8809 emitBlock(HeadBB, MapperFn);
8810 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8811 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8812 // Evaluate whether the initial condition is satisfied.
8813 Value *IsEmpty =
8814 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8815 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8816
8817 // Emit the loop body block.
8818 emitBlock(BodyBB, MapperFn);
8819 BasicBlock *LastBB = BodyBB;
8820 PHINode *PtrPHI =
8821 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8822 PtrPHI->addIncoming(PtrBegin, HeadBB);
8823
8824 // Get map clause information. Fill up the arrays with all mapped variables.
8825 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8826 if (!Info)
8827 return Info.takeError();
8828
8829 // Call the runtime API __tgt_mapper_num_components to get the number of
8830 // pre-existing components.
8831 Value *OffloadingArgs[] = {MapperHandle};
8832 Value *PreviousSize = Builder.CreateCall(
8833 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8834 OffloadingArgs);
8835 Value *ShiftedPreviousSize =
8836 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8837
8838 // Fill up the runtime mapper handle for all components.
8839 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8840 Value *CurBaseArg = Info->BasePointers[I];
8841 Value *CurBeginArg = Info->Pointers[I];
8842 Value *CurSizeArg = Info->Sizes[I];
8843 Value *CurNameArg = Info->Names.size()
8844 ? Info->Names[I]
8845 : Constant::getNullValue(Builder.getPtrTy());
8846
8847 // Extract the MEMBER_OF field from the map type.
8848 Value *OriMapType = Builder.getInt64(
8849 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8850 Info->Types[I]));
8851 Value *MemberMapType =
8852 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8853
8854 // Combine the map type inherited from user-defined mapper with that
8855 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8856 // bits of the \a MapType, which is the input argument of the mapper
8857 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8858 // bits of MemberMapType.
8859 // [OpenMP 5.0], 1.2.6. map-type decay.
8860 // | alloc | to | from | tofrom | release | delete
8861 // ----------------------------------------------------------
8862 // alloc | alloc | alloc | alloc | alloc | release | delete
8863 // to | alloc | to | alloc | to | release | delete
8864 // from | alloc | alloc | from | from | release | delete
8865 // tofrom | alloc | to | from | tofrom | release | delete
8866 Value *LeftToFrom = Builder.CreateAnd(
8867 MapType,
8868 Builder.getInt64(
8869 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8870 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8871 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8872 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8873 BasicBlock *AllocElseBB =
8874 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8875 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8876 BasicBlock *ToElseBB =
8877 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8878 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8879 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8880 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8881 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8882 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8883 emitBlock(AllocBB, MapperFn);
8884 Value *AllocMapType = Builder.CreateAnd(
8885 MemberMapType,
8886 Builder.getInt64(
8887 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8888 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8889 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8890 Builder.CreateBr(EndBB);
8891 emitBlock(AllocElseBB, MapperFn);
8892 Value *IsTo = Builder.CreateICmpEQ(
8893 LeftToFrom,
8894 Builder.getInt64(
8895 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8896 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8897 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8898 // In case of to, clear OMP_MAP_FROM.
8899 emitBlock(ToBB, MapperFn);
8900 Value *ToMapType = Builder.CreateAnd(
8901 MemberMapType,
8902 Builder.getInt64(
8903 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8904 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8905 Builder.CreateBr(EndBB);
8906 emitBlock(ToElseBB, MapperFn);
8907 Value *IsFrom = Builder.CreateICmpEQ(
8908 LeftToFrom,
8909 Builder.getInt64(
8910 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8911 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8912 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8913 // In case of from, clear OMP_MAP_TO.
8914 emitBlock(FromBB, MapperFn);
8915 Value *FromMapType = Builder.CreateAnd(
8916 MemberMapType,
8917 Builder.getInt64(
8918 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8919 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8920 // In case of tofrom, do nothing.
8921 emitBlock(EndBB, MapperFn);
8922 LastBB = EndBB;
8923 PHINode *CurMapType =
8924 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8925 CurMapType->addIncoming(AllocMapType, AllocBB);
8926 CurMapType->addIncoming(ToMapType, ToBB);
8927 CurMapType->addIncoming(FromMapType, FromBB);
8928 CurMapType->addIncoming(MemberMapType, ToElseBB);
8929
8930 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8931 CurSizeArg, CurMapType, CurNameArg};
8932
8933 auto ChildMapperFn = CustomMapperCB(I);
8934 if (!ChildMapperFn)
8935 return ChildMapperFn.takeError();
8936 if (*ChildMapperFn) {
8937 // Call the corresponding mapper function.
8938 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8939 } else {
8940 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8941 // data structure.
8942 Builder.CreateCall(
8943 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8944 OffloadingArgs);
8945 }
8946 }
8947
8948 // Update the pointer to point to the next element that needs to be mapped,
8949 // and check whether we have mapped all elements.
8950 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8951 "omp.arraymap.next");
8952 PtrPHI->addIncoming(PtrNext, LastBB);
8953 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8954 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8955 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8956
8957 emitBlock(ExitBB, MapperFn);
8958 // Emit array deletion if this is an array section and \p MapType indicates
8959 // that deletion is required.
8960 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8961 MapType, MapName, ElementSize, DoneBB,
8962 /*IsInit=*/false);
8963
8964 // Emit the function exit block.
8965 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8966
8967 Builder.CreateRetVoid();
8968 Builder.restoreIP(SavedIP);
8969 return MapperFn;
8970}
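// Sketch of the generated mapper, which follows the libomptarget mapper ABI:
//   void <FuncName>(ptr %handle, ptr %base, ptr %begin,
//                   i64 %size, i64 %maptype, ptr %mapname)
// The body divides %size by the element store size and walks the elements,
// pushing one component per element (with map-type decay applied) through
// __tgt_push_mapper_component or a nested custom mapper.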
8971
8972Error OpenMPIRBuilder::emitOffloadingArrays(
8973 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8974 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8975 bool IsNonContiguous,
8976 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8977
8978 // Reset the array information.
8979 Info.clearArrayInfo();
8980 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8981
8982 if (Info.NumberOfPtrs == 0)
8983 return Error::success();
8984
8985 Builder.restoreIP(AllocaIP);
8986 // Detect if we have any capture size requiring runtime evaluation of the
8987 // size so that a constant array could be eventually used.
8988 ArrayType *PointerArrayType =
8989 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8990
8991 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8992 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8993
8994 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8995 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8996 AllocaInst *MappersArray = Builder.CreateAlloca(
8997 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8998 Info.RTArgs.MappersArray = MappersArray;
8999
9000 // If we don't have any VLA types or other types that require runtime
9001 // evaluation, we can use a constant array for the map sizes, otherwise we
9002 // need to fill up the arrays as we do for the pointers.
9003 Type *Int64Ty = Builder.getInt64Ty();
9004 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9005 ConstantInt::get(Int64Ty, 0));
9006 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9007 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9008 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9009 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9010 if (IsNonContiguous &&
9011 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9012 CombinedInfo.Types[I] &
9013 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9014 ConstSizes[I] =
9015 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9016 else
9017 ConstSizes[I] = CI;
9018 continue;
9019 }
9020 }
9021 RuntimeSizes.set(I);
9022 }
9023
9024 if (RuntimeSizes.all()) {
9025 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9026 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9027 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9028 restoreIPandDebugLoc(Builder, CodeGenIP);
9029 } else {
9030 auto *SizesArrayInit = ConstantArray::get(
9031 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9032 std::string Name = createPlatformSpecificName({"offload_sizes"});
9033 auto *SizesArrayGbl =
9034 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9035 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9036 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9037
9038 if (!RuntimeSizes.any()) {
9039 Info.RTArgs.SizesArray = SizesArrayGbl;
9040 } else {
9041 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9042 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9043 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9044 AllocaInst *Buffer = Builder.CreateAlloca(
9045 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9046 Buffer->setAlignment(OffloadSizeAlign);
9047 restoreIPandDebugLoc(Builder, CodeGenIP);
9048 Builder.CreateMemCpy(
9049 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9050 SizesArrayGbl, OffloadSizeAlign,
9051 Builder.getIntN(
9052 IndexSize,
9053 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9054
9055 Info.RTArgs.SizesArray = Buffer;
9056 }
9057 restoreIPandDebugLoc(Builder, CodeGenIP);
9058 }
9059
9060 // The map types are always constant so we don't need to generate code to
9061 // fill arrays. Instead, we create an array constant.
9062 SmallVector<uint64_t, 4> Mapping;
9063 for (auto mapFlag : CombinedInfo.Types)
9064 Mapping.push_back(
9065 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9066 mapFlag));
9067 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9068 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9069 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9070
9071 // The information types are only built if provided.
9072 if (!CombinedInfo.Names.empty()) {
9073 auto *MapNamesArrayGbl = createOffloadMapnames(
9074 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9075 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9076 Info.EmitDebug = true;
9077 } else {
9078 Info.RTArgs.MapNamesArray =
9079 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9080 Info.EmitDebug = false;
9081 }
9082
9083 // If there's a present map type modifier, it must not be applied to the end
9084 // of a region, so generate a separate map type array in that case.
9085 if (Info.separateBeginEndCalls()) {
9086 bool EndMapTypesDiffer = false;
9087 for (uint64_t &Type : Mapping) {
9088 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9089 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9090 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9091 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9092 EndMapTypesDiffer = true;
9093 }
9094 }
9095 if (EndMapTypesDiffer) {
9096 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9097 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9098 }
9099 }
9100
9101 PointerType *PtrTy = Builder.getPtrTy();
9102 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9103 Value *BPVal = CombinedInfo.BasePointers[I];
9104 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9105 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9106 0, I);
9107 Builder.CreateAlignedStore(BPVal, BP,
9108 M.getDataLayout().getPrefTypeAlign(PtrTy));
9109
9110 if (Info.requiresDevicePointerInfo()) {
9111 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9112 CodeGenIP = Builder.saveIP();
9113 Builder.restoreIP(AllocaIP);
9114 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9115 Builder.restoreIP(CodeGenIP);
9116 if (DeviceAddrCB)
9117 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9118 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9119 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9120 if (DeviceAddrCB)
9121 DeviceAddrCB(I, BP);
9122 }
9123 }
9124
9125 Value *PVal = CombinedInfo.Pointers[I];
9126 Value *P = Builder.CreateConstInBoundsGEP2_32(
9127 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9128 I);
9129 // TODO: Check alignment correct.
9130 Builder.CreateAlignedStore(PVal, P,
9131 M.getDataLayout().getPrefTypeAlign(PtrTy));
9132
9133 if (RuntimeSizes.test(I)) {
9134 Value *S = Builder.CreateConstInBoundsGEP2_32(
9135 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9136 /*Idx0=*/0,
9137 /*Idx1=*/I);
9138 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9139 Int64Ty,
9140 /*isSigned=*/true),
9141 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9142 }
9143 // Fill up the mapper array.
9144 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9145 Value *MFunc = ConstantPointerNull::get(PtrTy);
9146
9147 auto CustomMFunc = CustomMapperCB(I);
9148 if (!CustomMFunc)
9149 return CustomMFunc.takeError();
9150 if (*CustomMFunc)
9151 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9152
9153 Value *MAddr = Builder.CreateInBoundsGEP(
9154 MappersArray->getAllocatedType(), MappersArray,
9155 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9156 Builder.CreateAlignedStore(
9157 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9158 }
9159
9160 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9161 Info.NumberOfPtrs == 0)
9162 return Error::success();
9163 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9164 return Error::success();
9165}
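// Sketch: when every mapped size is a compile-time constant, the sizes are
// emitted once as a private constant global rather than stored per launch:
//   @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 400]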
9166
9167void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9168 BasicBlock *CurBB = Builder.GetInsertBlock();
9169
9170 if (!CurBB || CurBB->getTerminator()) {
9171 // If there is no insert point or the previous block is already
9172 // terminated, don't touch it.
9173 } else {
9174 // Otherwise, create a fall-through branch.
9175 Builder.CreateBr(Target);
9176 }
9177
9178 Builder.ClearInsertionPoint();
9179}
9180
9181void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9182 bool IsFinished) {
9183 BasicBlock *CurBB = Builder.GetInsertBlock();
9184
9185 // Fall out of the current block (if necessary).
9186 emitBranch(BB);
9187
9188 if (IsFinished && BB->use_empty()) {
9189 BB->eraseFromParent();
9190 return;
9191 }
9192
9193 // Place the block after the current block, if possible, or else at
9194 // the end of the function.
9195 if (CurBB && CurBB->getParent())
9196 CurFn->insert(std::next(CurBB->getIterator()), BB);
9197 else
9198 CurFn->insert(CurFn->end(), BB);
9199 Builder.SetInsertPoint(BB);
9200}
9201
9202Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9203 BodyGenCallbackTy ElseGen,
9204 InsertPointTy AllocaIP) {
9205 // If the condition constant folds and can be elided, try to avoid emitting
9206 // the condition and the dead arm of the if/else.
9207 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9208 auto CondConstant = CI->getSExtValue();
9209 if (CondConstant)
9210 return ThenGen(AllocaIP, Builder.saveIP());
9211
9212 return ElseGen(AllocaIP, Builder.saveIP());
9213 }
9214
9215 Function *CurFn = Builder.GetInsertBlock()->getParent();
9216
9217 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9218 // emit the conditional branch.
9219 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9220 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9221 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9222 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9223 // Emit the 'then' code.
9224 emitBlock(ThenBlock, CurFn);
9225 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9226 return Err;
9227 emitBranch(ContBlock);
9228 // Emit the 'else' code if present.
9229 // There is no need to emit line number for unconditional branch.
9230 emitBlock(ElseBlock, CurFn);
9231 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9232 return Err;
9233 // There is no need to emit line number for unconditional branch.
9234 emitBranch(ContBlock);
9235 // Emit the continuation block for code after the if.
9236 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9237 return Error::success();
9238}
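// Sketch of the diamond built for a non-constant condition:
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// omp_if.then:                                    ; body emitted by ThenGen
//   br label %omp_if.end
// omp_if.else:                                    ; body emitted by ElseGen
//   br label %omp_if.end
// omp_if.end: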
9239
9240bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9241 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9244 "Unexpected Atomic Ordering.");
9245
9246 bool Flush = false;
9248
9249 switch (AK) {
9250 case Read:
9251 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9252 AO == AtomicOrdering::SequentiallyConsistent) {
9253 FlushAO = AtomicOrdering::Acquire;
9254 Flush = true;
9255 }
9256 break;
9257 case Write:
9258 case Compare:
9259 case Update:
9260 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9261 AO == AtomicOrdering::SequentiallyConsistent) {
9262 FlushAO = AtomicOrdering::Release;
9263 Flush = true;
9264 }
9265 break;
9266 case Capture:
9267 switch (AO) {
9268 case AtomicOrdering::Acquire:
9269 FlushAO = AtomicOrdering::Acquire;
9270 Flush = true;
9271 break;
9272 case AtomicOrdering::Release:
9273 FlushAO = AtomicOrdering::Release;
9274 Flush = true;
9275 break;
9276 case AtomicOrdering::AcquireRelease:
9277 case AtomicOrdering::SequentiallyConsistent:
9278 FlushAO = AtomicOrdering::AcquireRelease;
9279 Flush = true;
9280 break;
9281 default:
9282 // do nothing - leave silently.
9283 break;
9284 }
9285 }
9286
9287 if (Flush) {
9288 // Currently the Flush RT call still doesn't take a memory_ordering
9289 // argument, so until it does, this resolves which atomic ordering to use
9290 // but issues the plain flush call.
9291 // TODO: pass `FlushAO` after memory ordering support is added
9292 (void)FlushAO;
9293 emitFlush(Loc);
9294 }
9295
9296 // For AO == AtomicOrdering::Monotonic and all other case combinations,
9297 // do nothing.
9298 return Flush;
9299}
9300
9301OpenMPIRBuilder::InsertPointTy
9302OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9303 AtomicOpValue &X, AtomicOpValue &V,
9304 AtomicOrdering AO, InsertPointTy AllocaIP) {
9305 if (!updateToLocation(Loc))
9306 return Loc.IP;
9307
9308 assert(X.Var->getType()->isPointerTy() &&
9309 "OMP Atomic expects a pointer to target memory");
9310 Type *XElemTy = X.ElemTy;
9311 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9312 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9313 "OMP atomic read expected a scalar type");
9314
9315 Value *XRead = nullptr;
9316
9317 if (XElemTy->isIntegerTy()) {
9318 LoadInst *XLD =
9319 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9320 XLD->setAtomic(AO);
9321 XRead = cast<Value>(XLD);
9322 } else if (XElemTy->isStructTy()) {
9323 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9324 // target does not support `atomicrmw` of the size of the struct
9325 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9326 OldVal->setAtomic(AO);
9327 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9328 unsigned LoadSize =
9329 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9330 OpenMPIRBuilder::AtomicInfo atomicInfo(
9331 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9332 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9333 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9334 XRead = AtomicLoadRes.first;
9335 OldVal->eraseFromParent();
9336 } else {
9337 // We need to perform atomic op as integer
9338 IntegerType *IntCastTy =
9339 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9340 LoadInst *XLoad =
9341 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9342 XLoad->setAtomic(AO);
9343 if (XElemTy->isFloatingPointTy()) {
9344 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9345 } else {
9346 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9347 }
9348 }
9349 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9350 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9351 return Builder.saveIP();
9352}
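// Sketch of the non-integer path for a float 'x' with acquire ordering: the
// value is loaded as a same-width integer and bitcast back:
//   %0 = load atomic i32, ptr %x acquire, align 4
//   %1 = bitcast i32 %0 to float                  ; "atomic.flt.cast"
//   store float %1, ptr %v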
9353
9354OpenMPIRBuilder::InsertPointTy
9355OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9356 AtomicOpValue &X, Value *Expr,
9357 AtomicOrdering AO, InsertPointTy AllocaIP) {
9358 if (!updateToLocation(Loc))
9359 return Loc.IP;
9360
9361 assert(X.Var->getType()->isPointerTy() &&
9362 "OMP Atomic expects a pointer to target memory");
9363 Type *XElemTy = X.ElemTy;
9364 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9365 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9366 "OMP atomic write expected a scalar type");
9367
9368 if (XElemTy->isIntegerTy()) {
9369 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9370 XSt->setAtomic(AO);
9371 } else if (XElemTy->isStructTy()) {
9372 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9373 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9374 unsigned LoadSize =
9375 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9376 OpenMPIRBuilder::AtomicInfo atomicInfo(
9377 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9378 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9379 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9380 OldVal->eraseFromParent();
9381 } else {
9382 // We need to bitcast and perform atomic op as integers
9383 IntegerType *IntCastTy =
9384 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9385 Value *ExprCast =
9386 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9387 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9388 XSt->setAtomic(AO);
9389 }
9390
9391 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9392 return Builder.saveIP();
9393}
9394
9395OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9396 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9397 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9398 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9399 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9400 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9401 if (!updateToLocation(Loc))
9402 return Loc.IP;
9403
9404 LLVM_DEBUG({
9405 Type *XTy = X.Var->getType();
9406 assert(XTy->isPointerTy() &&
9407 "OMP Atomic expects a pointer to target memory");
9408 Type *XElemTy = X.ElemTy;
9409 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9410 XElemTy->isPointerTy()) &&
9411 "OMP atomic update expected a scalar type");
9412 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9413 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9414 "OpenMP atomic does not support LT or GT operations");
9415 });
9416
9417 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9418 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9419 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9420 if (!AtomicResult)
9421 return AtomicResult.takeError();
9422 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9423 return Builder.saveIP();
9424}
9425
9426// FIXME: Duplicating AtomicExpand
9427Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9428 AtomicRMWInst::BinOp RMWOp) {
9429 switch (RMWOp) {
9430 case AtomicRMWInst::Add:
9431 return Builder.CreateAdd(Src1, Src2);
9432 case AtomicRMWInst::Sub:
9433 return Builder.CreateSub(Src1, Src2);
9434 case AtomicRMWInst::And:
9435 return Builder.CreateAnd(Src1, Src2);
9436 case AtomicRMWInst::Nand:
9437 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9438 case AtomicRMWInst::Or:
9439 return Builder.CreateOr(Src1, Src2);
9440 case AtomicRMWInst::Xor:
9441 return Builder.CreateXor(Src1, Src2);
9442 case AtomicRMWInst::Xchg:
9443 case AtomicRMWInst::FAdd:
9444 case AtomicRMWInst::FSub:
9445 case AtomicRMWInst::BAD_BINOP:
9446 case AtomicRMWInst::Max:
9447 case AtomicRMWInst::Min:
9448 case AtomicRMWInst::UMax:
9449 case AtomicRMWInst::UMin:
9450 case AtomicRMWInst::FMax:
9451 case AtomicRMWInst::FMin:
9452 case AtomicRMWInst::FMaximum:
9453 case AtomicRMWInst::FMinimum:
9454 case AtomicRMWInst::UIncWrap:
9455 case AtomicRMWInst::UDecWrap:
9456 case AtomicRMWInst::USubCond:
9457 case AtomicRMWInst::USubSat:
9458 llvm_unreachable("Unsupported atomic update operation");
9459 }
9460 llvm_unreachable("Unsupported atomic update operation");
9461}
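// Sketch: for RMWOp == AtomicRMWInst::Sub this recomputes the value the
// atomicrmw wrote, which postfix/capture codegen may consume:
//   %newval = sub i32 %old, %expr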
9462
9463Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9464 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9465 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9466 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9467 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9468 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9469 // or a complex datatype.
9470 bool emitRMWOp = false;
9471 switch (RMWOp) {
9472 case AtomicRMWInst::Add:
9473 case AtomicRMWInst::And:
9474 case AtomicRMWInst::Nand:
9475 case AtomicRMWInst::Or:
9476 case AtomicRMWInst::Xor:
9477 case AtomicRMWInst::Xchg:
9478 emitRMWOp = XElemTy;
9479 break;
9480 case AtomicRMWInst::Sub:
9481 emitRMWOp = (IsXBinopExpr && XElemTy);
9482 break;
9483 default:
9484 emitRMWOp = false;
9485 }
9486 emitRMWOp &= XElemTy->isIntegerTy();
9487
9488 std::pair<Value *, Value *> Res;
9489 if (emitRMWOp) {
9490 AtomicRMWInst *RMWInst =
9491 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9492 if (T.isAMDGPU()) {
9493 if (IsIgnoreDenormalMode)
9494 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9495 llvm::MDNode::get(Builder.getContext(), {}));
9496 if (!IsFineGrainedMemory)
9497 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9498 llvm::MDNode::get(Builder.getContext(), {}));
9499 if (!IsRemoteMemory)
9500 RMWInst->setMetadata("amdgpu.no.remote.memory",
9501 llvm::MDNode::get(Builder.getContext(), {}));
9502 }
9503 Res.first = RMWInst;
9504 // Not needed except in case of postfix captures. Generated anyway for
9505 // consistency with the else part. Will be removed by any DCE pass.
9506 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9507 if (RMWOp == AtomicRMWInst::Xchg)
9508 Res.second = Res.first;
9509 else
9510 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9511 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9512 XElemTy->isStructTy()) {
9513 LoadInst *OldVal =
9514 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9515 OldVal->setAtomic(AO);
9516 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9517 unsigned LoadSize =
9518 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9519
9520 OpenMPIRBuilder::AtomicInfo atomicInfo(
9521 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9522 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9523 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9524 BasicBlock *CurBB = Builder.GetInsertBlock();
9525 Instruction *CurBBTI = CurBB->getTerminator();
9526 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9527 BasicBlock *ExitBB =
9528 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9529 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9530 X->getName() + ".atomic.cont");
9531 ContBB->getTerminator()->eraseFromParent();
9532 Builder.restoreIP(AllocaIP);
9533 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9534 NewAtomicAddr->setName(X->getName() + "x.new.val");
9535 Builder.SetInsertPoint(ContBB);
9536 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9537 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9538 Value *OldExprVal = PHI;
9539 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9540 if (!CBResult)
9541 return CBResult.takeError();
9542 Value *Upd = *CBResult;
9543 Builder.CreateStore(Upd, NewAtomicAddr);
9544 AtomicOrdering Failure =
9545 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9546 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9547 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9548 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9549 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9550 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9551 OldVal->eraseFromParent();
9552 Res.first = OldExprVal;
9553 Res.second = Upd;
9554
9555 if (UnreachableInst *ExitTI =
9556 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9557 CurBBTI->eraseFromParent();
9558 Builder.SetInsertPoint(ExitBB);
9559 } else {
9560 Builder.SetInsertPoint(ExitTI);
9561 }
9562 } else {
9563 IntegerType *IntCastTy =
9564 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9565 LoadInst *OldVal =
9566 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9567 OldVal->setAtomic(AO);
9568 // CurBB
9569 // | /---\
9570 // ContBB |
9571 // | \---/
9572 // ExitBB
9573 BasicBlock *CurBB = Builder.GetInsertBlock();
9574 Instruction *CurBBTI = CurBB->getTerminator();
9575 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9576 BasicBlock *ExitBB =
9577 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9578 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9579 X->getName() + ".atomic.cont");
9580 ContBB->getTerminator()->eraseFromParent();
9581 Builder.restoreIP(AllocaIP);
9582 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9583 NewAtomicAddr->setName(X->getName() + "x.new.val");
9584 Builder.SetInsertPoint(ContBB);
9585 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9586 PHI->addIncoming(OldVal, CurBB);
9587 bool IsIntTy = XElemTy->isIntegerTy();
9588 Value *OldExprVal = PHI;
9589 if (!IsIntTy) {
9590 if (XElemTy->isFloatingPointTy()) {
9591 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9592 X->getName() + ".atomic.fltCast");
9593 } else {
9594 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9595 X->getName() + ".atomic.ptrCast");
9596 }
9597 }
9598
9599 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9600 if (!CBResult)
9601 return CBResult.takeError();
9602 Value *Upd = *CBResult;
9603 Builder.CreateStore(Upd, NewAtomicAddr);
9604 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9605 AtomicOrdering Failure =
9606 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9607 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9608 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9609 Result->setVolatile(VolatileX);
9610 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9611 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9612 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9613 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9614
9615 Res.first = OldExprVal;
9616 Res.second = Upd;
9617
9618 // Set the insertion point in the exit block.
9619 if (UnreachableInst *ExitTI =
9620 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9621 CurBBTI->eraseFromParent();
9622 Builder.SetInsertPoint(ExitBB);
9623 } else {
9624 Builder.SetInsertPoint(ExitTI);
9625 }
9626 }
9627
9628 return Res;
9629}
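// Sketch of the compare-exchange loop emitted on the generic path for an i32
// update (orderings depend on AO):
//   x.atomic.cont:
//     %old  = phi i32 [ %init, %entry ], [ %prev, %x.atomic.cont ]
//     %upd  = <value computed by UpdateOp from %old>
//     %pair = cmpxchg ptr %x, i32 %old, i32 %upd monotonic monotonic
//     %prev = extractvalue { i32, i1 } %pair, 0
//     %ok   = extractvalue { i32, i1 } %pair, 1
//     br i1 %ok, label %x.atomic.exit, label %x.atomic.cont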
9630
9631OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9632 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9633 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9634 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9635 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9636 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9637 if (!updateToLocation(Loc))
9638 return Loc.IP;
9639
9640 LLVM_DEBUG({
9641 Type *XTy = X.Var->getType();
9642 assert(XTy->isPointerTy() &&
9643 "OMP Atomic expects a pointer to target memory");
9644 Type *XElemTy = X.ElemTy;
9645 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9646 XElemTy->isPointerTy()) &&
9647 "OMP atomic capture expected a scalar type");
9648 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9649 "OpenMP atomic does not support LT or GT operations");
9650 });
9651
9652 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9653 // 'x' is simply atomically rewritten with 'expr'.
9654 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9655 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9656 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9657 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9658 if (!AtomicResult)
9659 return AtomicResult.takeError();
9660 Value *CapturedVal =
9661 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9662 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9663
9664 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9665 return Builder.saveIP();
9666}
9667
9668OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9669 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9670 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9671 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9672 bool IsFailOnly) {
9673
9674 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9675 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9676 IsPostfixUpdate, IsFailOnly, Failure);
9677}
9678
9679OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9680 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9681 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9682 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9683 bool IsFailOnly, AtomicOrdering Failure) {
9684
9685 if (!updateToLocation(Loc))
9686 return Loc.IP;
9687
9688 assert(X.Var->getType()->isPointerTy() &&
9689 "OMP atomic expects a pointer to target memory");
9690 // compare capture
9691 if (V.Var) {
9692 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9693 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9694 }
9695
9696 bool IsInteger = E->getType()->isIntegerTy();
9697
9698 if (Op == OMPAtomicCompareOp::EQ) {
9699 AtomicCmpXchgInst *Result = nullptr;
9700 if (!IsInteger) {
9701 IntegerType *IntCastTy =
9702 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9703 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9704 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9705 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9706 AO, Failure);
9707 } else {
9708 Result =
9709 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9710 }
9711
9712 if (V.Var) {
9713 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9714 if (!IsInteger)
9715 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9716 assert(OldValue->getType() == V.ElemTy &&
9717 "OldValue and V must be of same type");
9718 if (IsPostfixUpdate) {
9719 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9720 } else {
9721 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9722 if (IsFailOnly) {
9723 // CurBB----
9724 // | |
9725 // v |
9726 // ContBB |
9727 // | |
9728 // v |
9729 // ExitBB <-
9730 //
9731 // where ContBB only contains the store of old value to 'v'.
9732 BasicBlock *CurBB = Builder.GetInsertBlock();
9733 Instruction *CurBBTI = CurBB->getTerminator();
9734 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9735 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9736 CurBBTI, X.Var->getName() + ".atomic.exit");
9737 BasicBlock *ContBB = CurBB->splitBasicBlock(
9738 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9739 ContBB->getTerminator()->eraseFromParent();
9740 CurBB->getTerminator()->eraseFromParent();
9741
9742 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9743
9744 Builder.SetInsertPoint(ContBB);
9745 Builder.CreateStore(OldValue, V.Var);
9746 Builder.CreateBr(ExitBB);
9747
9748 if (UnreachableInst *ExitTI =
9749 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9750 CurBBTI->eraseFromParent();
9751 Builder.SetInsertPoint(ExitBB);
9752 } else {
9753 Builder.SetInsertPoint(ExitTI);
9754 }
9755 } else {
9756 Value *CapturedValue =
9757 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9758 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9759 }
9760 }
9761 }
9762 // The comparison result has to be stored.
9763 if (R.Var) {
9764 assert(R.Var->getType()->isPointerTy() &&
9765 "r.var must be of pointer type");
9766 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9767
9768 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9769 Value *ResultCast = R.IsSigned
9770 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9771 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9772 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9773 }
9774 } else {
9775 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9776 "Op should be either max or min at this point");
9777 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9778
9779 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9780 // Let's take max as example.
9781 // OpenMP form:
9782 // x = x > expr ? expr : x;
9783 // LLVM form:
9784 // *ptr = *ptr > val ? *ptr : val;
9785 // We need to transform to LLVM form.
9786 // x = x <= expr ? x : expr;
9787 AtomicRMWInst::BinOp NewOp;
9788 if (IsXBinopExpr) {
9789 if (IsInteger) {
9790 if (X.IsSigned)
9791 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9792 : AtomicRMWInst::Max;
9793 else
9794 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9795 : AtomicRMWInst::UMax;
9796 } else {
9797 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9798 : AtomicRMWInst::FMax;
9799 }
9800 } else {
9801 if (IsInteger) {
9802 if (X.IsSigned)
9803 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9804 : AtomicRMWInst::Min;
9805 else
9806 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9807 : AtomicRMWInst::UMin;
9808 } else {
9809 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9810 : AtomicRMWInst::FMin;
9811 }
9812 }
9813
9814 AtomicRMWInst *OldValue =
9815 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9816 if (V.Var) {
9817 Value *CapturedValue = nullptr;
9818 if (IsPostfixUpdate) {
9819 CapturedValue = OldValue;
9820 } else {
9821 CmpInst::Predicate Pred;
9822 switch (NewOp) {
9823 case AtomicRMWInst::Max:
9824 Pred = CmpInst::ICMP_SGT;
9825 break;
9826 case AtomicRMWInst::UMax:
9827 Pred = CmpInst::ICMP_UGT;
9828 break;
9829 case AtomicRMWInst::FMax:
9830 Pred = CmpInst::FCMP_OGT;
9831 break;
9832 case AtomicRMWInst::Min:
9833 Pred = CmpInst::ICMP_SLT;
9834 break;
9835 case AtomicRMWInst::UMin:
9836 Pred = CmpInst::ICMP_ULT;
9837 break;
9838 case AtomicRMWInst::FMin:
9839 Pred = CmpInst::FCMP_OLT;
9840 break;
9841 default:
9842 llvm_unreachable("unexpected comparison op");
9843 }
9844 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9845 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9846 }
9847 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9848 }
9849 }
9850
9851 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9852
9853 return Builder.saveIP();
9854}
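// Sketch of the ordop reversal described above: for a signed integer 'x',
// the OpenMP form
//   x = x > e ? e : x;   // MAX, IsXBinopExpr
// lowers to (ordering per AO):
//   atomicrmw min ptr %x, i32 %e monotonic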
9855
9856OpenMPIRBuilder::InsertPointOrErrorTy
9857OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9858 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9859 Value *NumTeamsUpper, Value *ThreadLimit,
9860 Value *IfExpr) {
9861 if (!updateToLocation(Loc))
9862 return InsertPointTy();
9863
9864 uint32_t SrcLocStrSize;
9865 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9866 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9867 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9868
9869 // Outer allocation basicblock is the entry block of the current function.
9870 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9871 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9872 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9873 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9874 }
9875
9876 // The current basic block is split into four basic blocks. After outlining,
9877 // they will be mapped as follows:
9878 // ```
9879 // def current_fn() {
9880 // current_basic_block:
9881 // br label %teams.exit
9882 // teams.exit:
9883 // ; instructions after teams
9884 // }
9885 //
9886 // def outlined_fn() {
9887 // teams.alloca:
9888 // br label %teams.body
9889 // teams.body:
9890 // ; instructions within teams body
9891 // }
9892 // ```
9893 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9894 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9895 BasicBlock *AllocaBB =
9896 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9897
9898 bool SubClausesPresent =
9899 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9900 // Push num_teams
9901 if (!Config.isTargetDevice() && SubClausesPresent) {
9902 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9903 "if lowerbound is non-null, then upperbound must also be non-null "
9904 "for bounds on num_teams");
9905
9906 if (NumTeamsUpper == nullptr)
9907 NumTeamsUpper = Builder.getInt32(0);
9908
9909 if (NumTeamsLower == nullptr)
9910 NumTeamsLower = NumTeamsUpper;
9911
9912 if (IfExpr) {
9913 assert(IfExpr->getType()->isIntegerTy() &&
9914 "argument to if clause must be an integer value");
9915
9916 // upper = ifexpr ? upper : 1
9917 if (IfExpr->getType() != Int1)
9918 IfExpr = Builder.CreateICmpNE(IfExpr,
9919 ConstantInt::get(IfExpr->getType(), 0));
9920 NumTeamsUpper = Builder.CreateSelect(
9921 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9922
9923 // lower = ifexpr ? lower : 1
9924 NumTeamsLower = Builder.CreateSelect(
9925 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9926 }
9927
9928 if (ThreadLimit == nullptr)
9929 ThreadLimit = Builder.getInt32(0);
9930
9931 Value *ThreadNum = getOrCreateThreadID(Ident);
9932 Builder.CreateCall(
9933 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9934 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9935 }
9936 // Generate the body of teams.
9937 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9938 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9939 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9940 return Err;
9941
9942 OutlineInfo OI;
9943 OI.EntryBB = AllocaBB;
9944 OI.ExitBB = ExitBB;
9945 OI.OuterAllocaBB = &OuterAllocaBB;
9946
9947 // Insert fake values for global tid and bound tid.
9948 SmallVector<Instruction *, 8> ToBeDeleted;
9949 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9950 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9951 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9952 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9953 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9954
9955 auto HostPostOutlineCB = [this, Ident,
9956 ToBeDeleted](Function &OutlinedFn) mutable {
9957 // The stale call instruction will be replaced with a new call instruction
9958 // for the runtime call with the outlined function.
9959
9960 assert(OutlinedFn.hasOneUse() &&
9961 "there must be a single user for the outlined function");
9962 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9963 ToBeDeleted.push_back(StaleCI);
9964
9965 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9966 "Outlined function must have two or three arguments only");
9967
9968 bool HasShared = OutlinedFn.arg_size() == 3;
9969
9970 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9971 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9972 if (HasShared)
9973 OutlinedFn.getArg(2)->setName("data");
9974
9975 // Call to the runtime function for teams in the current function.
9976 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9977 "outlined function.");
9978 Builder.SetInsertPoint(StaleCI);
9979 SmallVector<Value *> Args = {
9980 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9981 if (HasShared)
9982 Args.push_back(StaleCI->getArgOperand(2));
9983 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9984 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9985 Args);
9986
9987 for (Instruction *I : llvm::reverse(ToBeDeleted))
9988 I->eraseFromParent();
9989 };
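// For illustration, the net effect of HostPostOutlineCB is to replace the
// stale call with a fork of the outlined function, roughly (IR sketch; the
// trailing ptr is only present when the region captured values):
// ```
// call void @__kmpc_fork_teams(ptr @ident, i32 1, ptr @outlined_fn, ptr %data)
// ```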
9990
9991 if (!Config.isTargetDevice())
9992 OI.PostOutlineCB = HostPostOutlineCB;
9993
9994 addOutlineInfo(std::move(OI));
9995
9996 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9997
9998 return Builder.saveIP();
9999}
10000
10001OpenMPIRBuilder::InsertPointOrErrorTy
10002OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10003 InsertPointTy OuterAllocaIP,
10004 BodyGenCallbackTy BodyGenCB) {
10005 if (!updateToLocation(Loc))
10006 return InsertPointTy();
10007
10008 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10009
10010 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10011 BasicBlock *BodyBB =
10012 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10013 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10014 }
10015 BasicBlock *ExitBB =
10016 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10017 BasicBlock *BodyBB =
10018 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10019 BasicBlock *AllocaBB =
10020 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10021
10022 // Generate the body of the distribute construct.
10023 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10024 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10025 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10026 return Err;
10027
10028 OutlineInfo OI;
10029 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10030 OI.EntryBB = AllocaBB;
10031 OI.ExitBB = ExitBB;
10032
10033 addOutlineInfo(std::move(OI));
10034 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10035
10036 return Builder.saveIP();
10037}
10038
10039GlobalVariable *
10040OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10041 std::string VarName) {
10042 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10043 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10044 Names.size()),
10045 Names);
10046 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10047 M, MapNamesArrayInit->getType(),
10048 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10049 VarName);
10050 return MapNamesArrayGlobal;
10051}
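// For illustration, with two map names this emits a private global along
// these lines (IR sketch; symbol names invented):
// ```
// @.offload_mapnames = private constant [2 x ptr] [ptr @.nm0, ptr @.nm1]
// ```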
10052
10053// Create all simple and struct types exposed by the runtime and remember
10054// the llvm::PointerTypes of them for easy access later.
10055void OpenMPIRBuilder::initializeTypes(Module &M) {
10056 LLVMContext &Ctx = M.getContext();
10057 StructType *T;
10058#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10059#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10060 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10061 VarName##PtrTy = PointerType::getUnqual(Ctx);
10062#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10063 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10064 VarName##Ptr = PointerType::getUnqual(Ctx);
10065#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10066 T = StructType::getTypeByName(Ctx, StructName); \
10067 if (!T) \
10068 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10069 VarName = T; \
10070 VarName##Ptr = PointerType::getUnqual(Ctx);
10071#include "llvm/Frontend/OpenMP/OMPKinds.def"
10072}
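// As an example of the expansion above, an OMPKinds.def entry of the form
// OMP_STRUCT_TYPE(Ident, "struct.ident_t", false, ...) reuses the module's
// existing %struct.ident_t if present, creates it otherwise, and caches the
// result in Ident together with an opaque IdentPtr pointer type.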
10073
10074void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10075 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10076 SmallVectorImpl<BasicBlock *> &BlockVector) {
10077 SmallVector<BasicBlock *, 32> Worklist;
10078 BlockSet.insert(EntryBB);
10079 BlockSet.insert(ExitBB);
10080
10081 Worklist.push_back(EntryBB);
10082 while (!Worklist.empty()) {
10083 BasicBlock *BB = Worklist.pop_back_val();
10084 BlockVector.push_back(BB);
10085 for (BasicBlock *SuccBB : successors(BB))
10086 if (BlockSet.insert(SuccBB).second)
10087 Worklist.push_back(SuccBB);
10088 }
10089}
10090
10091void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10092 uint64_t Size, int32_t Flags,
10093 GlobalValue::LinkageTypes,
10094 StringRef Name) {
10095 if (!Config.isGPU()) {
10096 llvm::offloading::emitOffloadingEntry(
10097 M, object::OffloadKind::OFK_OpenMP, ID,
10098 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10099 return;
10100 }
10101 // TODO: Add support for global variables on the device after declare target
10102 // support.
10103 Function *Fn = dyn_cast<Function>(Addr);
10104 if (!Fn)
10105 return;
10106
10107 // Add a function attribute for the kernel.
10108 Fn->addFnAttr("kernel");
10109 if (T.isAMDGCN())
10110 Fn->addFnAttr("uniform-work-group-size", "true");
10111 Fn->addFnAttr(Attribute::MustProgress);
10112}
10113
10114// We only generate metadata for functions that contain target regions.
10115void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10116 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10117
10118 // If there are no entries, we don't need to do anything.
10119 if (OffloadInfoManager.empty())
10120 return;
10121
10122 LLVMContext &C = M.getContext();
10123 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10124 TargetRegionEntryInfo>,
10125 16>
10126 OrderedEntries(OffloadInfoManager.size());
10127
10128 // Auxiliary methods to create metadata values and strings.
10129 auto &&GetMDInt = [this](unsigned V) {
10130 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10131 };
10132
10133 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10134
10135 // Create the offloading info metadata node.
10136 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10137 auto &&TargetRegionMetadataEmitter =
10138 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10139 const TargetRegionEntryInfo &EntryInfo,
10140 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10141 // Generate metadata for target regions. Each entry of this metadata
10142 // contains:
10143 // - Entry 0 -> Kind of this type of metadata (0).
10144 // - Entry 1 -> Device ID of the file where the entry was identified.
10145 // - Entry 2 -> File ID of the file where the entry was identified.
10146 // - Entry 3 -> Mangled name of the function where the entry was
10147 // identified.
10148 // - Entry 4 -> Line in the file where the entry was identified.
10149 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10150 // - Entry 6 -> Order the entry was created.
10151 // The first element of the metadata node is the kind.
10152 Metadata *Ops[] = {
10153 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10154 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10155 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10156 GetMDInt(E.getOrder())};
10157
10158 // Save this entry in the right position of the ordered entries array.
10159 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10160
10161 // Add metadata to the named metadata node.
10162 MD->addOperand(MDNode::get(C, Ops));
10163 };
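// For illustration, a single target region renders as metadata like this
// (illustrative values):
// ```
// !omp_offload.info = !{!0}
// !0 = !{i32 0, i32 42, i32 13, !"_Z3foov", i32 7, i32 0, i32 0}
// ```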
10164
10165 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10166
10167 // Create a function that emits metadata for each device global variable entry.
10168 auto &&DeviceGlobalVarMetadataEmitter =
10169 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10170 StringRef MangledName,
10171 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10172 // Generate metadata for global variables. Each entry of this metadata
10173 // contains:
10174 // - Entry 0 -> Kind of this type of metadata (1).
10175 // - Entry 1 -> Mangled name of the variable.
10176 // - Entry 2 -> Declare target kind.
10177 // - Entry 3 -> Order the entry was created.
10178 // The first element of the metadata node is the kind.
10179 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10180 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10181
10182 // Save this entry in the right position of the ordered entries array.
10183 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10184 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10185
10186 // Add metadata to the named metadata node.
10187 MD->addOperand(MDNode::get(C, Ops));
10188 };
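// For illustration, a declare-target global variable produces an entry such
// as (illustrative values):
// ```
// !1 = !{i32 1, !"global_var", i32 0, i32 1}
// ```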
10189
10190 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10191 DeviceGlobalVarMetadataEmitter);
10192
10193 for (const auto &E : OrderedEntries) {
10194 assert(E.first && "All ordered entries must exist!");
10195 if (const auto *CE =
10196 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10197 E.first)) {
10198 if (!CE->getID() || !CE->getAddress()) {
10199 // Do not blame the entry if the parent function is not emitted.
10200 TargetRegionEntryInfo EntryInfo = E.second;
10201 StringRef FnName = EntryInfo.ParentName;
10202 if (!M.getNamedValue(FnName))
10203 continue;
10204 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10205 continue;
10206 }
10207 createOffloadEntry(CE->getID(), CE->getAddress(),
10208 /*Size=*/0, CE->getFlags(),
10209 GlobalValue::WeakAnyLinkage);
10210 } else if (const auto *CE = dyn_cast<
10211 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10212 E.first)) {
10213 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10214 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10215 CE->getFlags());
10216 switch (Flags) {
10217 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10218 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10219 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10220 continue;
10221 if (!CE->getAddress()) {
10222 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10223 continue;
10224 }
10225 // The variable has no definition - no need to add the entry.
10226 if (CE->getVarSize() == 0)
10227 continue;
10228 break;
10229 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10230 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10231 (!Config.isTargetDevice() && CE->getAddress())) &&
10232 "Declaret target link address is set.");
10233 if (Config.isTargetDevice())
10234 continue;
10235 if (!CE->getAddress()) {
10236 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10237 continue;
10238 }
10239 break;
10240 default:
10241 break;
10242 }
10243
10244 // Hidden or internal symbols on the device are not externally visible.
10245 // We should not attempt to register them by creating an offloading
10246 // entry. Indirect variables are handled separately on the device.
10247 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10248 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10249 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10250 continue;
10251
10252 // Indirect globals need to use a special name that doesn't match the name
10253 // of the associated host global.
10254 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10255 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10256 Flags, CE->getLinkage(), CE->getVarName());
10257 else
10258 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10259 Flags, CE->getLinkage());
10260
10261 } else {
10262 llvm_unreachable("Unsupported entry kind.");
10263 }
10264 }
10265
10266 // Emit requires directive globals to a special entry so the runtime can
10267 // register them when the device image is loaded.
10268 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10269 // entries should be redesigned to better suit this use-case.
10270 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10271 offloading::emitOffloadingEntry(
10272 M, object::OffloadKind::OFK_OpenMP,
10273 Constant::getNullValue(PointerType::getUnqual(M.getContext())),
10274 ".requires", /*Size=*/0,
10275 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10276 Config.getRequiresFlags());
10277}
10278
10279void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10280 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10281 unsigned FileID, unsigned Line, unsigned Count) {
10282 raw_svector_ostream OS(Name);
10283 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10284 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10285 if (Count)
10286 OS << "_" << Count;
10287}
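// For example, with KernelNamePrefix == "__omp_offloading_" a region in
// function "foo" at line 23 gets a name like (hex IDs invented):
//   __omp_offloading_10303_2ea2d7_foo_l23
// and a second region at the same location receives an "_1" suffix.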
10288
10289void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10290 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10291 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10292 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10293 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10294 EntryInfo.Line, NewCount);
10295}
10296
10297TargetRegionEntryInfo
10298OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10299 StringRef ParentName) {
10300 sys::fs::UniqueID ID(0xdeadf17e, 0);
10301 auto FileIDInfo = CallBack();
10302 uint64_t FileID = 0;
10303 std::error_code EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID);
10304 // If the inode ID could not be determined, create a hash value of
10305 // the current file name and use that as an ID.
10306 if (EC)
10307 FileID = hash_value(std::get<0>(FileIDInfo));
10308 else
10309 FileID = ID.getFile();
10310
10311 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10312 std::get<1>(FileIDInfo));
10313}
10314
10315unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10316 unsigned Offset = 0;
10317 for (uint64_t Remain =
10318 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10319 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10320 !(Remain & 1); Remain = Remain >> 1)
10321 Offset++;
10322 return Offset;
10323}
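// Note: OMP_MAP_MEMBER_OF occupies the high 16 bits of the 64-bit map-type
// flags (mask 0xffff000000000000), so the loop above counts its trailing
// zero bits and returns 48.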
10324
10325omp::OpenMPOffloadMappingFlags
10326OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10327 // Shift left by getFlagMemberOffset() bits.
10328 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10329 << getFlagMemberOffset());
10330}
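// Example: getMemberOfFlag(0) yields 1 << 48, i.e. 0x0001000000000000; the
// MEMBER_OF field stores the 1-based index of the parent map entry.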
10331
10332void OpenMPIRBuilder::setCorrectMemberOfFlag(
10333 omp::OpenMPOffloadMappingFlags &Flags,
10334 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10335 // If the entry is PTR_AND_OBJ but has not been marked with the special
10336 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10337 // marked as MEMBER_OF.
10338 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10340 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10343 return;
10344
10345 // Reset the placeholder value to prepare the flag for the assignment of the
10346 // proper MEMBER_OF value.
10347 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10348 Flags |= MemberOfFlag;
10349}
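// Usage sketch: an entry that carries the 0xFFFF placeholder in its
// MEMBER_OF field has the field cleared above and replaced with, e.g.,
// getMemberOfFlag(ParentIdx) so that it points at the enclosing entry.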
10350
10351Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10352 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10353 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10354 bool IsDeclaration, bool IsExternallyVisible,
10355 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10356 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10357 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10358 std::function<Constant *()> GlobalInitializer,
10359 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10360 // TODO: convert this to utilise the IRBuilder Config rather than
10361 // a passed down argument.
10362 if (OpenMPSIMD)
10363 return nullptr;
10364
10365 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10366 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10367 CaptureClause ==
10368 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10369 Config.hasRequiresUnifiedSharedMemory())) {
10370 SmallString<64> PtrName;
10371 {
10372 raw_svector_ostream OS(PtrName);
10373 OS << MangledName;
10374 if (!IsExternallyVisible)
10375 OS << format("_%x", EntryInfo.FileID);
10376 OS << "_decl_tgt_ref_ptr";
10377 }
10378
10379 Value *Ptr = M.getNamedValue(PtrName);
10380
10381 if (!Ptr) {
10382 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10383 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10384
10385 auto *GV = cast<GlobalVariable>(Ptr);
10386 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10387
10388 if (!Config.isTargetDevice()) {
10389 if (GlobalInitializer)
10390 GV->setInitializer(GlobalInitializer());
10391 else
10392 GV->setInitializer(GlobalValue);
10393 }
10394
10395 registerTargetGlobalVariable(
10396 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10397 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10398 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10399 }
10400
10401 return cast<Constant>(Ptr);
10402 }
10403
10404 return nullptr;
10405}
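// For illustration, for a "declare target link" variable @x this creates on
// the host an indirection global of roughly this shape (IR sketch):
// ```
// @x_decl_tgt_ref_ptr = weak global ptr @x
// ```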
10406
10407void OpenMPIRBuilder::registerTargetGlobalVariable(
10408 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10409 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10410 bool IsDeclaration, bool IsExternallyVisible,
10411 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10412 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10413 std::vector<Triple> TargetTriple,
10414 std::function<Constant *()> GlobalInitializer,
10415 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10416 Constant *Addr) {
10417 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10418 (TargetTriple.empty() && !Config.isTargetDevice()))
10419 return;
10420
10421 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10422 StringRef VarName;
10423 int64_t VarSize;
10424 GlobalValue::LinkageTypes Linkage;
10425
10426 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10427 CaptureClause ==
10428 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10429 !Config.hasRequiresUnifiedSharedMemory()) {
10430 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10431 VarName = MangledName;
10432 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10433
10434 if (!IsDeclaration)
10435 VarSize = divideCeil(
10436 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10437 else
10438 VarSize = 0;
10439 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10440
10441 // This is a workaround carried over from Clang which prevents undesired
10442 // optimisation of internal variables.
10443 if (Config.isTargetDevice() &&
10444 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10445 // Do not create a "ref-variable" if the original is not also available
10446 // on the host.
10447 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10448 return;
10449
10450 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10451
10452 if (!M.getNamedValue(RefName)) {
10453 Constant *AddrRef =
10454 getOrCreateInternalVariable(Addr->getType(), RefName);
10455 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10456 GvAddrRef->setConstant(true);
10457 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10458 GvAddrRef->setInitializer(Addr);
10459 GeneratedRefs.push_back(GvAddrRef);
10460 }
10461 }
10462 } else {
10463 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10464 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10465 else
10466 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10467
10468 if (Config.isTargetDevice()) {
10469 VarName = (Addr) ? Addr->getName() : "";
10470 Addr = nullptr;
10471 } else {
10472 Addr = getAddrOfDeclareTargetVar(
10473 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10474 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10475 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10476 VarName = (Addr) ? Addr->getName() : "";
10477 }
10478 VarSize = M.getDataLayout().getPointerSize();
10479 Linkage = GlobalValue::WeakAnyLinkage;
10480 }
10481
10482 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10483 Flags, Linkage);
10484}
10485
10486/// Loads all the offload entries information from the host IR
10487/// metadata.
10488void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10489 // If we are in target mode, load the metadata from the host IR. This code has
10490 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10491
10492 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10493 if (!MD)
10494 return;
10495
10496 for (MDNode *MN : MD->operands()) {
10497 auto &&GetMDInt = [MN](unsigned Idx) {
10498 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10499 return cast<ConstantInt>(V->getValue())->getZExtValue();
10500 };
10501
10502 auto &&GetMDString = [MN](unsigned Idx) {
10503 auto *V = cast<MDString>(MN->getOperand(Idx));
10504 return V->getString();
10505 };
10506
10507 switch (GetMDInt(0)) {
10508 default:
10509 llvm_unreachable("Unexpected metadata!");
10510 break;
10511 case OffloadEntriesInfoManager::OffloadEntryInfo::
10512 OffloadingEntryInfoTargetRegion: {
10513 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10514 /*DeviceID=*/GetMDInt(1),
10515 /*FileID=*/GetMDInt(2),
10516 /*Line=*/GetMDInt(4),
10517 /*Count=*/GetMDInt(5));
10518 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10519 /*Order=*/GetMDInt(6));
10520 break;
10521 }
10522 case OffloadEntriesInfoManager::OffloadEntryInfo::
10523 OffloadingEntryInfoDeviceGlobalVar:
10524 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10525 /*MangledName=*/GetMDString(1),
10526 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10527 /*Flags=*/GetMDInt(2)),
10528 /*Order=*/GetMDInt(3));
10529 break;
10530 }
10531 }
10532}
10533
10534void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
10535 if (HostFilePath.empty())
10536 return;
10537
10538 auto Buf = MemoryBuffer::getFile(HostFilePath);
10539 if (std::error_code Err = Buf.getError()) {
10540 report_fatal_error(("error opening host file from host file path inside of "
10541 "OpenMPIRBuilder: " +
10542 Err.message())
10543 .c_str());
10544 }
10545
10546 LLVMContext Ctx;
10547 auto M = expectedToErrorOrAndEmitErrors(
10548 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10549 if (std::error_code Err = M.getError()) {
10550 report_fatal_error(
10551 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10552 .c_str());
10553 }
10554
10555 loadOffloadInfoMetadata(*M.get());
10556}
10557
10558//===----------------------------------------------------------------------===//
10559// OffloadEntriesInfoManager
10560//===----------------------------------------------------------------------===//
10561
10562bool OffloadEntriesInfoManager::empty() const {
10563 return OffloadEntriesTargetRegion.empty() &&
10564 OffloadEntriesDeviceGlobalVar.empty();
10565}
10566
10567unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10568 const TargetRegionEntryInfo &EntryInfo) const {
10569 auto It = OffloadEntriesTargetRegionCount.find(
10570 getTargetRegionEntryCountKey(EntryInfo));
10571 if (It == OffloadEntriesTargetRegionCount.end())
10572 return 0;
10573 return It->second;
10574}
10575
10576void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10577 const TargetRegionEntryInfo &EntryInfo) {
10578 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10579 EntryInfo.Count + 1;
10580}
10581
10582/// Initialize target region entry.
10583void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10584 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10585 OffloadEntriesTargetRegion[EntryInfo] =
10586 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10587 OMPTargetRegionEntryTargetRegion);
10588 ++OffloadingEntriesNum;
10589}
10590
10591void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10592 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10593 OMPTargetRegionEntryKind Flags) {
10594 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10595
10596 // Update the EntryInfo with the next available count for this location.
10597 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10598
10599 // If we are emitting code for a target, the entry is already initialized,
10600 // it only has to be registered.
10601 if (OMPBuilder->Config.isTargetDevice()) {
10602 // This could happen if the device compilation is invoked standalone.
10603 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10604 return;
10605 }
10606 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10607 Entry.setAddress(Addr);
10608 Entry.setID(ID);
10609 Entry.setFlags(Flags);
10610 } else {
10611 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10612 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10613 return;
10614 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10615 "Target region entry already registered!");
10616 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10617 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10618 ++OffloadingEntriesNum;
10619 }
10620 incrementTargetRegionEntryInfoCount(EntryInfo);
10621}
10622
10623bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10624 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10625
10626 // Update the EntryInfo with the next available count for this location.
10627 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10628
10629 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10630 if (It == OffloadEntriesTargetRegion.end()) {
10631 return false;
10632 }
10633 // Fail if this entry is already registered.
10634 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10635 return false;
10636 return true;
10637}
10638
10639void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10640 const OffloadTargetRegionEntryInfoActTy &Action) {
10641 // Scan all target region entries and perform the provided action.
10642 for (const auto &It : OffloadEntriesTargetRegion) {
10643 Action(It.first, It.second);
10644 }
10645}
10646
10647void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10648 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10649 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10650 ++OffloadingEntriesNum;
10651}
10652
10653void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10654 StringRef VarName, Constant *Addr, int64_t VarSize,
10655 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10656 if (OMPBuilder->Config.isTargetDevice()) {
10657 // This could happen if the device compilation is invoked standalone.
10658 if (!hasDeviceGlobalVarEntryInfo(VarName))
10659 return;
10660 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10661 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10662 if (Entry.getVarSize() == 0) {
10663 Entry.setVarSize(VarSize);
10664 Entry.setLinkage(Linkage);
10665 }
10666 return;
10667 }
10668 Entry.setVarSize(VarSize);
10669 Entry.setLinkage(Linkage);
10670 Entry.setAddress(Addr);
10671 } else {
10672 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10673 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10674 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10675 "Entry not initialized!");
10676 if (Entry.getVarSize() == 0) {
10677 Entry.setVarSize(VarSize);
10678 Entry.setLinkage(Linkage);
10679 }
10680 return;
10681 }
10682 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10683 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10684 Addr, VarSize, Flags, Linkage,
10685 VarName.str());
10686 else
10687 OffloadEntriesDeviceGlobalVar.try_emplace(
10688 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10689 ++OffloadingEntriesNum;
10690 }
10691}
10692
10693void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10694 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10695 // Scan all device global variable entries and perform the provided action.
10696 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10697 Action(E.getKey(), E.getValue());
10698}
10699
10700//===----------------------------------------------------------------------===//
10701// CanonicalLoopInfo
10702//===----------------------------------------------------------------------===//
10703
10704void CanonicalLoopInfo::collectControlBlocks(
10705 SmallVectorImpl<BasicBlock *> &BBs) {
10706 // We only count those BBs as control blocks for which we do not need to
10707 // traverse the CFG, i.e. not the loop body, which can contain arbitrary
10708 // control flow. For consistency, this also means we do not add the Body
10709 // block, which is just the entry to the body code.
10710 BBs.reserve(BBs.size() + 6);
10711 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10712}
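// For reference, the canonical control-flow skeleton maintained here is:
// ```
// preheader -> header -> cond -(true)-> body -> ... -> latch -> header
//                        cond -(false)-> exit -> after
// ```
// with arbitrary control flow permitted only between body and latch.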
10713
10714BasicBlock *CanonicalLoopInfo::getPreheader() const {
10715 assert(isValid() && "Requires a valid canonical loop");
10716 for (BasicBlock *Pred : predecessors(Header)) {
10717 if (Pred != Latch)
10718 return Pred;
10719 }
10720 llvm_unreachable("Missing preheader");
10721}
10722
10723void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10724 assert(isValid() && "Requires a valid canonical loop");
10725
10726 Instruction *CmpI = &getCond()->front();
10727 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10728 CmpI->setOperand(1, TripCount);
10729
10730#ifndef NDEBUG
10731 assertOK();
10732#endif
10733}
10734
10735void CanonicalLoopInfo::mapIndVar(
10736 llvm::function_ref<Value *(Instruction *)> Updater) {
10737 assert(isValid() && "Requires a valid canonical loop");
10738
10739 Instruction *OldIV = getIndVar();
10740
10741 // Record all uses excluding those introduced by the updater. Uses by the
10742 // CanonicalLoopInfo itself to keep track of the number of iterations are
10743 // excluded.
10744 SmallVector<Use *> ReplacableUses;
10745 for (Use &U : OldIV->uses()) {
10746 auto *User = dyn_cast<Instruction>(U.getUser());
10747 if (!User)
10748 continue;
10749 if (User->getParent() == getCond())
10750 continue;
10751 if (User->getParent() == getLatch())
10752 continue;
10753 ReplacableUses.push_back(&U);
10754 }
10755
10756 // Run the updater that may introduce new uses
10757 Value *NewIV = Updater(OldIV);
10758
10759 // Replace the old uses with the value returned by the updater.
10760 for (Use *U : ReplacableUses)
10761 U->set(NewIV);
10762
10763#ifndef NDEBUG
10764 assertOK();
10765#endif
10766}
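// Usage sketch (hypothetical caller; Start/Step are the user's values):
// rewrite body uses of the zero-based canonical IV into a strided logical
// IV while the loop-control uses in Cond/Latch stay untouched:
// ```
// CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//   Builder.SetInsertPoint(CLI->getBody(),
//                          CLI->getBody()->getFirstInsertionPt());
//   return Builder.CreateAdd(Start, Builder.CreateMul(OldIV, Step));
// });
// ```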
10767
10768void CanonicalLoopInfo::assertOK() const {
10769#ifndef NDEBUG
10770 // No constraints if this object currently does not describe a loop.
10771 if (!isValid())
10772 return;
10773
10774 BasicBlock *Preheader = getPreheader();
10775 BasicBlock *Body = getBody();
10776 BasicBlock *After = getAfter();
10777
10778 // Verify standard control-flow we use for OpenMP loops.
10779 assert(Preheader);
10780 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10781 "Preheader must terminate with unconditional branch");
10782 assert(Preheader->getSingleSuccessor() == Header &&
10783 "Preheader must jump to header");
10784
10785 assert(Header);
10786 assert(isa<BranchInst>(Header->getTerminator()) &&
10787 "Header must terminate with unconditional branch");
10788 assert(Header->getSingleSuccessor() == Cond &&
10789 "Header must jump to exiting block");
10790
10791 assert(Cond);
10792 assert(Cond->getSinglePredecessor() == Header &&
10793 "Exiting block only reachable from header");
10794
10795 assert(isa<BranchInst>(Cond->getTerminator()) &&
10796 "Exiting block must terminate with conditional branch");
10797 assert(size(successors(Cond)) == 2 &&
10798 "Exiting block must have two successors");
10799 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10800 "Exiting block's first successor jump to the body");
10801 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10802 "Exiting block's second successor must exit the loop");
10803
10804 assert(Body);
10805 assert(Body->getSinglePredecessor() == Cond &&
10806 "Body only reachable from exiting block");
10807 assert(!isa<PHINode>(Body->front()));
10808
10809 assert(Latch);
10810 assert(isa<BranchInst>(Latch->getTerminator()) &&
10811 "Latch must terminate with unconditional branch");
10812 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10813 // TODO: To support simple redirecting of the end of the body code that has
10814 // multiple exits; introduce another auxiliary basic block like preheader and after.
10815 assert(Latch->getSinglePredecessor() != nullptr);
10816 assert(!isa<PHINode>(Latch->front()));
10817
10818 assert(Exit);
10819 assert(isa<BranchInst>(Exit->getTerminator()) &&
10820 "Exit block must terminate with unconditional branch");
10821 assert(Exit->getSingleSuccessor() == After &&
10822 "Exit block must jump to after block");
10823
10824 assert(After);
10825 assert(After->getSinglePredecessor() == Exit &&
10826 "After block only reachable from exit block");
10827 assert(After->empty() || !isa<PHINode>(After->front()));
10828
10829 Instruction *IndVar = getIndVar();
10830 assert(IndVar && "Canonical induction variable not found?");
10831 assert(isa<IntegerType>(IndVar->getType()) &&
10832 "Induction variable must be an integer");
10833 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10834 "Induction variable must be a PHI in the loop header");
10835 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10836 assert(
10837 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10838 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10839
10840 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10841 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10842 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10843 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10844 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10845 ->isOne());
10846
10847 Value *TripCount = getTripCount();
10848 assert(TripCount && "Loop trip count not found?");
10849 assert(IndVar->getType() == TripCount->getType() &&
10850 "Trip count and induction variable must have the same type");
10851
10852 auto *CmpI = cast<CmpInst>(&Cond->front());
10853 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10854 "Exit condition must be a signed less-than comparison");
10855 assert(CmpI->getOperand(0) == IndVar &&
10856 "Exit condition must compare the induction variable");
10857 assert(CmpI->getOperand(1) == TripCount &&
10858 "Exit condition must compare with the trip count");
10859#endif
10860}
10861
10862void CanonicalLoopInfo::invalidate() {
10863 Header = nullptr;
10864 Cond = nullptr;
10865 Latch = nullptr;
10866 Exit = nullptr;
10867}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:546
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn)
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:535
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
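A sketch of the common constant factories over an existing LLVMContext Ctx:
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *Zero = Constant::getNullValue(I32);    // i32 0
  Constant *Ones = Constant::getAllOnesValue(I32); // i32 -1
  ConstantInt *True = ConstantInt::getTrue(Ctx);   // i1 true
  Constant *Null = ConstantPointerNull::get(PointerType::getUnqual(Ctx));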
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:468
Record of a variable value-assignment, a.k.a. a non-instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
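Putting the Function APIs together, a sketch that declares a hypothetical i32 @foo(i32) in an existing Module M (Ctx is its LLVMContext):
  FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx),
                                        {Type::getInt32Ty(Ctx)},
                                        /*isVarArg=*/false);
  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage,
                                 /*AddrSpace=*/0, "foo", &M);
  F->addFnAttr(Attribute::NoUnwind);
  F->addParamAttr(0, Attribute::NoUndef);
  F->getArg(0)->setName("x");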
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:511
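A sketch creating a zero-initialized internal global in an existing Module M and replacing its initializer afterwards:
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *GV = new GlobalVariable(M, I32, /*isConstant=*/false,
                                GlobalValue::InternalLinkage,
                                Constant::getNullValue(I32), "counter");
  GV->setInitializer(ConstantInt::get(I32, 42)); // drops the old initializer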
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
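saveIP/restoreIP pair naturally with InsertPoint; a sketch assuming Builder and a detour block OtherBB:
  IRBuilderBase::InsertPoint SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(OtherBB); // emit into another block
  // ... create instructions ...
  Builder.restoreIP(SavedIP);      // back to the saved block and point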
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:570
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:981
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1573
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1443
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1565
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:607
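A sketch attaching a small metadata tuple to an assumed Instruction *I ("my.note" is a made-up metadata kind):
  LLVMContext &Ctx = I->getContext();
  Metadata *Ops[] = {MDString::get(Ctx, "tag"),
                     ConstantAsMetadata::get(
                         ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
  I->setMetadata(Ctx.getMDKindID("my.note"), MDTuple::get(Ctx, Ops));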
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1753
iterator_range< op_iterator > operands()
Definition Metadata.h:1849
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:247
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
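A sketch upgrading an existing load/store pair (LI, SI) to monotonic atomics:
  LI->setAtomic(AtomicOrdering::Monotonic); // default SyncScope::System
  SI->setAlignment(Align(4));               // atomics need natural alignment
  SI->setAtomic(AtomicOrdering::Monotonic);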
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:255
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:233
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:461
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:281
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:626
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
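A sketch building a two-case switch over an i32 value V at the end of block BB (DefaultBB, CaseZeroBB, CaseOneBB assumed):
  IntegerType *I32 = IntegerType::get(Ctx, 32);
  SwitchInst *Sw = SwitchInst::Create(V, DefaultBB, /*NumCases=*/2, BB);
  Sw->addCase(ConstantInt::get(I32, 0), CaseZeroBB);
  Sw->addCase(ConstantInt::get(I32, 1), CaseOneBB);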
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1040
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1102
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:411
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1112
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:133
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:149
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "New" if the callback ShouldRep...
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
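A sketch of predicated use replacement, rewriting only the uses of OldV that sit inside an assumed Function F:
  OldV->replaceUsesWithIf(NewV, [&](Use &U) {
    auto *UserI = dyn_cast<Instruction>(U.getUser());
    return UserI && UserI->getFunction() == &F; // leave other uses alone
  });
  if (auto *OldI = dyn_cast<Instruction>(OldV); OldI && OldI->use_empty())
    OldI->eraseFromParent();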
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:359
A raw_ostream that writes to an SmallVector or SmallString.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:85
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID used if the device was not defined; the runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
LLVM_ABI std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:843
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1685
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:853
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
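A sketch of the range combinators (F is an assumed Function; VecA, VecB, and combine() are hypothetical):
  for (auto [Idx, Arg] : enumerate(F.args()))
    dbgs() << Idx << ": " << Arg.getName() << "\n";
  for (auto [A, B] : zip(VecA, VecB)) // stops at the shorter range
    combine(A, B);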
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:294
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:646
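The early-increment adaptor makes erase-while-iterating safe; a sketch over an assumed BasicBlock &BB (isInstructionTriviallyDead is from llvm/Transforms/Utils/Local.h):
  for (Instruction &Inst : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&Inst))
      Inst.eraseFromParent(); // safe: the iterator already advanced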
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:336
auto reverse(ContainerTy &&C)
Definition STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:126
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
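A sketch splitting an assumed block BB before instruction SplitPt, without updating any analyses:
  BasicBlock *Tail = SplitBlock(BB, SplitPt->getIterator(), /*DT=*/nullptr,
                                /*LI=*/nullptr, /*MSSAU=*/nullptr, "tail");
  assert(BB->getSingleSuccessor() == Tail); // BB now branches to the tail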
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs from their parent function.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...