// Extracted from the LLVM 21.0.0git generated documentation page for
// AMDGPUTargetTransformInfo.cpp. The extraction fused source line numbers
// into the code and dropped some lines; reconstructed spans are marked
// with NOTE(review) comments below.
//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
16
18#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "AMDGPUtti"
34
36 "amdgpu-unroll-threshold-private",
37 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
38 cl::init(2700), cl::Hidden);
39
41 "amdgpu-unroll-threshold-local",
42 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
43 cl::init(1000), cl::Hidden);
44
46 "amdgpu-unroll-threshold-if",
47 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
48 cl::init(200), cl::Hidden);
49
51 "amdgpu-unroll-runtime-local",
52 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
53 cl::init(true), cl::Hidden);
54
56 "amdgpu-unroll-max-block-to-analyze",
57 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58 cl::init(32), cl::Hidden);
59
60static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61 cl::Hidden, cl::init(4000),
62 cl::desc("Cost of alloca argument"));
63
64// If the amount of scratch memory to eliminate exceeds our ability to allocate
65// it into registers we gain nothing by aggressively inlining functions for that
66// heuristic.
68 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69 cl::init(256),
70 cl::desc("Maximum alloca size to use for inline cost"));
71
72// Inliner constraint to achieve reasonable compilation time.
74 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
75 cl::desc("Maximum number of BBs allowed in a function after inlining"
76 " (compile time constraint)"));
77
78// This default unroll factor is based on microbenchmarks on gfx1030.
80 "amdgpu-memcpy-loop-unroll",
81 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
82 "operations when lowering memcpy as a loop"),
83 cl::init(16), cl::Hidden);
84
85static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
86 unsigned Depth = 0) {
87 const Instruction *I = dyn_cast<Instruction>(Cond);
88 if (!I)
89 return false;
90
91 for (const Value *V : I->operand_values()) {
92 if (!L->contains(I))
93 continue;
94 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
95 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
96 return SubLoop->contains(PHI); }))
97 return true;
98 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
99 return true;
100 }
101 return false;
102}
103
105 : BaseT(TM, F.getDataLayout()),
106 TargetTriple(TM->getTargetTriple()),
107 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
108 TLI(ST->getTargetLowering()) {}
109
113 const Function &F = *L->getHeader()->getParent();
114 UP.Threshold =
115 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
116 UP.MaxCount = std::numeric_limits<unsigned>::max();
117 UP.Partial = true;
118
119 // Conditional branch in a loop back edge needs 3 additional exec
120 // manipulations in average.
121 UP.BEInsns += 3;
122
123 // We want to run unroll even for the loops which have been vectorized.
124 UP.UnrollVectorizedLoop = true;
125
126 // TODO: Do we want runtime unrolling?
127
128 // Maximum alloca size than can fit registers. Reserve 16 registers.
129 const unsigned MaxAlloca = (256 - 16) * 4;
130 unsigned ThresholdPrivate = UnrollThresholdPrivate;
131 unsigned ThresholdLocal = UnrollThresholdLocal;
132
133 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
134 // provided threshold value as the default for Threshold
135 if (MDNode *LoopUnrollThreshold =
136 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
137 if (LoopUnrollThreshold->getNumOperands() == 2) {
138 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
139 LoopUnrollThreshold->getOperand(1));
140 if (MetaThresholdValue) {
141 // We will also use the supplied value for PartialThreshold for now.
142 // We may introduce additional metadata if it becomes necessary in the
143 // future.
144 UP.Threshold = MetaThresholdValue->getSExtValue();
146 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
147 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
148 }
149 }
150 }
151
152 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
153 for (const BasicBlock *BB : L->getBlocks()) {
154 const DataLayout &DL = BB->getDataLayout();
155 unsigned LocalGEPsSeen = 0;
156
157 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
158 return SubLoop->contains(BB); }))
159 continue; // Block belongs to an inner loop.
160
161 for (const Instruction &I : *BB) {
162 // Unroll a loop which contains an "if" statement whose condition
163 // defined by a PHI belonging to the loop. This may help to eliminate
164 // if region and potentially even PHI itself, saving on both divergence
165 // and registers used for the PHI.
166 // Add a small bonus for each of such "if" statements.
167 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
168 if (UP.Threshold < MaxBoost && Br->isConditional()) {
169 BasicBlock *Succ0 = Br->getSuccessor(0);
170 BasicBlock *Succ1 = Br->getSuccessor(1);
171 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
172 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
173 continue;
174 if (dependsOnLocalPhi(L, Br->getCondition())) {
176 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
177 << " for loop:\n"
178 << *L << " due to " << *Br << '\n');
179 if (UP.Threshold >= MaxBoost)
180 return;
181 }
182 }
183 continue;
184 }
185
186 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
187 if (!GEP)
188 continue;
189
190 unsigned AS = GEP->getAddressSpace();
191 unsigned Threshold = 0;
193 Threshold = ThresholdPrivate;
195 Threshold = ThresholdLocal;
196 else
197 continue;
198
199 if (UP.Threshold >= Threshold)
200 continue;
201
202 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
203 const Value *Ptr = GEP->getPointerOperand();
204 const AllocaInst *Alloca =
205 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
206 if (!Alloca || !Alloca->isStaticAlloca())
207 continue;
208 Type *Ty = Alloca->getAllocatedType();
209 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
210 if (AllocaSize > MaxAlloca)
211 continue;
212 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
214 LocalGEPsSeen++;
215 // Inhibit unroll for local memory if we have seen addressing not to
216 // a variable, most likely we will be unable to combine it.
217 // Do not unroll too deep inner loops for local memory to give a chance
218 // to unroll an outer loop for a more important reason.
219 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
220 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
221 !isa<Argument>(GEP->getPointerOperand())))
222 continue;
223 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
224 << *L << " due to LDS use.\n");
226 }
227
228 // Check if GEP depends on a value defined by this loop itself.
229 bool HasLoopDef = false;
230 for (const Value *Op : GEP->operands()) {
231 const Instruction *Inst = dyn_cast<Instruction>(Op);
232 if (!Inst || L->isLoopInvariant(Op))
233 continue;
234
235 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
236 return SubLoop->contains(Inst); }))
237 continue;
238 HasLoopDef = true;
239 break;
240 }
241 if (!HasLoopDef)
242 continue;
243
244 // We want to do whatever we can to limit the number of alloca
245 // instructions that make it through to the code generator. allocas
246 // require us to use indirect addressing, which is slow and prone to
247 // compiler bugs. If this loop does an address calculation on an
248 // alloca ptr, then we want to use a higher than normal loop unroll
249 // threshold. This will give SROA a better chance to eliminate these
250 // allocas.
251 //
252 // We also want to have more unrolling for local memory to let ds
253 // instructions with different offsets combine.
254 //
255 // Don't use the maximum allowed value here as it will make some
256 // programs way too big.
257 UP.Threshold = Threshold;
258 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
259 << " for loop:\n"
260 << *L << " due to " << *GEP << '\n');
261 if (UP.Threshold >= MaxBoost)
262 return;
263 }
264
265 // If we got a GEP in a small BB from inner loop then increase max trip
266 // count to analyze for better estimation cost in unroll
267 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
269 }
270}
271
275}
276
278 return 1024;
279}
280
281const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
282 // Codegen control options which don't matter.
283 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
284 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
285 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
286 AMDGPU::FeatureUnalignedAccessMode,
287
288 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
289
290 // Property of the kernel/environment which can't actually differ.
291 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
292 AMDGPU::FeatureTrapHandler,
293
294 // The default assumption needs to be ecc is enabled, but no directly
295 // exposed operations depend on it, so it can be safely inlined.
296 AMDGPU::FeatureSRAMECC,
297
298 // Perf-tuning features
299 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
300
302 : BaseT(TM, F.getDataLayout()),
303 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
304 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
305 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
306 SIModeRegisterDefaults Mode(F, *ST);
307 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
308 HasFP64FP16Denormals =
309 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
310}
311
313 return !F || !ST->isSingleLaneExecution(*F);
314}
315
316unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
317 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
318 // registers. See getRegisterClassForType for the implementation.
319 // In this case vector registers are not vector in terms of
320 // VGPRs, but those which can hold multiple values.
321
322 // This is really the number of registers to fill when vectorizing /
323 // interleaving loops, so we lie to avoid trying to use all registers.
324 return 4;
325}
326
329 switch (K) {
331 return TypeSize::getFixed(32);
333 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
335 return TypeSize::getScalable(0);
336 }
337 llvm_unreachable("Unsupported register kind");
338}
339
341 return 32;
342}
343
344unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346 return 32 * 4 / ElemWidth;
347 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349 : 1;
350}
351
352unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
353 unsigned ChainSizeInBytes,
354 VectorType *VecTy) const {
355 unsigned VecRegBitWidth = VF * LoadSize;
356 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
357 // TODO: Support element-size less than 32bit?
358 return 128 / LoadSize;
359
360 return VF;
361}
362
363unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
364 unsigned ChainSizeInBytes,
365 VectorType *VecTy) const {
366 unsigned VecRegBitWidth = VF * StoreSize;
367 if (VecRegBitWidth > 128)
368 return 128 / StoreSize;
369
370 return VF;
371}
372
373unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
374 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
375 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
377 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
378 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
380 return 512;
381 }
382
383 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
384 return 8 * ST->getMaxPrivateElementSize();
385
386 // Common to flat, global, local and region. Assume for unknown addrspace.
387 return 128;
388}
389
390bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
391 Align Alignment,
392 unsigned AddrSpace) const {
393 // We allow vectorization of flat stores, even though we may need to decompose
394 // them later if they may access private memory. We don't have enough context
395 // here, and legalization can handle it.
396 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
397 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
398 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
399 }
400 return true;
401}
402
403bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
404 Align Alignment,
405 unsigned AddrSpace) const {
406 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
407}
408
409bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
410 Align Alignment,
411 unsigned AddrSpace) const {
412 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
413}
414
416 return 1024;
417}
418
420 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
421 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
422 std::optional<uint32_t> AtomicElementSize) const {
423
424 if (AtomicElementSize)
425 return Type::getIntNTy(Context, *AtomicElementSize * 8);
426
427 // 16-byte accesses achieve the highest copy throughput.
428 // If the operation has a fixed known length that is large enough, it is
429 // worthwhile to return an even wider type and let legalization lower it into
430 // multiple accesses, effectively unrolling the memcpy loop.
431 // We also rely on legalization to decompose into smaller accesses for
432 // subtargets and address spaces where it is necessary.
433 //
434 // Don't unroll if Length is not a constant, since unrolling leads to worse
435 // performance for length values that are smaller or slightly larger than the
436 // total size of the type returned here. Mitigating that would require a more
437 // complex lowering for variable-length memcpy and memmove.
438 unsigned I32EltsInVector = 4;
439 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
441 MemcpyLoopUnroll * I32EltsInVector);
442
443 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
444}
445
447 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
448 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
449 Align SrcAlign, Align DestAlign,
450 std::optional<uint32_t> AtomicCpySize) const {
451
452 if (AtomicCpySize)
454 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
455 DestAlign, AtomicCpySize);
456
457 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
458 while (RemainingBytes >= 16) {
459 OpsOut.push_back(I32x4Ty);
460 RemainingBytes -= 16;
461 }
462
463 Type *I64Ty = Type::getInt64Ty(Context);
464 while (RemainingBytes >= 8) {
465 OpsOut.push_back(I64Ty);
466 RemainingBytes -= 8;
467 }
468
469 Type *I32Ty = Type::getInt32Ty(Context);
470 while (RemainingBytes >= 4) {
471 OpsOut.push_back(I32Ty);
472 RemainingBytes -= 4;
473 }
474
475 Type *I16Ty = Type::getInt16Ty(Context);
476 while (RemainingBytes >= 2) {
477 OpsOut.push_back(I16Ty);
478 RemainingBytes -= 2;
479 }
480
481 Type *I8Ty = Type::getInt8Ty(Context);
482 while (RemainingBytes) {
483 OpsOut.push_back(I8Ty);
484 --RemainingBytes;
485 }
486}
487
489 // Disable unrolling if the loop is not vectorized.
490 // TODO: Enable this again.
491 if (VF.isScalar())
492 return 1;
493
494 return 8;
495}
496
498 MemIntrinsicInfo &Info) const {
499 switch (Inst->getIntrinsicID()) {
500 case Intrinsic::amdgcn_ds_ordered_add:
501 case Intrinsic::amdgcn_ds_ordered_swap: {
502 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
503 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
504 if (!Ordering || !Volatile)
505 return false; // Invalid.
506
507 unsigned OrderingVal = Ordering->getZExtValue();
508 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
509 return false;
510
511 Info.PtrVal = Inst->getArgOperand(0);
512 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
513 Info.ReadMem = true;
514 Info.WriteMem = true;
515 Info.IsVolatile = !Volatile->isZero();
516 return true;
517 }
518 default:
519 return false;
520 }
521}
522
524 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
527 const Instruction *CxtI) {
528
529 // Legalize the type.
530 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
531 int ISD = TLI->InstructionOpcodeToISD(Opcode);
532
533 // Because we don't have any legal vector operations, but the legal types, we
534 // need to account for split vectors.
535 unsigned NElts = LT.second.isVector() ?
536 LT.second.getVectorNumElements() : 1;
537
538 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
539
540 switch (ISD) {
541 case ISD::SHL:
542 case ISD::SRL:
543 case ISD::SRA:
544 if (SLT == MVT::i64)
545 return get64BitInstrCost(CostKind) * LT.first * NElts;
546
547 if (ST->has16BitInsts() && SLT == MVT::i16)
548 NElts = (NElts + 1) / 2;
549
550 // i32
551 return getFullRateInstrCost() * LT.first * NElts;
552 case ISD::ADD:
553 case ISD::SUB:
554 case ISD::AND:
555 case ISD::OR:
556 case ISD::XOR:
557 if (SLT == MVT::i64) {
558 // and, or and xor are typically split into 2 VALU instructions.
559 return 2 * getFullRateInstrCost() * LT.first * NElts;
560 }
561
562 if (ST->has16BitInsts() && SLT == MVT::i16)
563 NElts = (NElts + 1) / 2;
564
565 return LT.first * NElts * getFullRateInstrCost();
566 case ISD::MUL: {
567 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
568 if (SLT == MVT::i64) {
569 const int FullRateCost = getFullRateInstrCost();
570 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
571 }
572
573 if (ST->has16BitInsts() && SLT == MVT::i16)
574 NElts = (NElts + 1) / 2;
575
576 // i32
577 return QuarterRateCost * NElts * LT.first;
578 }
579 case ISD::FMUL:
580 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
581 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
582 // fused operation.
583 if (CxtI && CxtI->hasOneUse())
584 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
585 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
586 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
587 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
589 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
591
592 // Estimate all types may be fused with contract/unsafe flags
594 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
595 Options.UnsafeFPMath ||
596 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
598 }
599 }
600 [[fallthrough]];
601 case ISD::FADD:
602 case ISD::FSUB:
603 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
604 NElts = (NElts + 1) / 2;
605 if (SLT == MVT::f64)
606 return LT.first * NElts * get64BitInstrCost(CostKind);
607
608 if (ST->has16BitInsts() && SLT == MVT::f16)
609 NElts = (NElts + 1) / 2;
610
611 if (SLT == MVT::f32 || SLT == MVT::f16)
612 return LT.first * NElts * getFullRateInstrCost();
613 break;
614 case ISD::FDIV:
615 case ISD::FREM:
616 // FIXME: frem should be handled separately. The fdiv in it is most of it,
617 // but the current lowering is also not entirely correct.
618 if (SLT == MVT::f64) {
619 int Cost = 7 * get64BitInstrCost(CostKind) +
620 getQuarterRateInstrCost(CostKind) +
621 3 * getHalfRateInstrCost(CostKind);
622 // Add cost of workaround.
624 Cost += 3 * getFullRateInstrCost();
625
626 return LT.first * Cost * NElts;
627 }
628
629 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
630 // TODO: This is more complicated, unsafe flags etc.
631 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
632 (SLT == MVT::f16 && ST->has16BitInsts())) {
633 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
634 }
635 }
636
637 if (SLT == MVT::f16 && ST->has16BitInsts()) {
638 // 2 x v_cvt_f32_f16
639 // f32 rcp
640 // f32 fmul
641 // v_cvt_f16_f32
642 // f16 div_fixup
643 int Cost =
644 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
645 return LT.first * Cost * NElts;
646 }
647
648 if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
650 // Fast unsafe fdiv lowering:
651 // f32 rcp
652 // f32 fmul
653 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
654 return LT.first * Cost * NElts;
655 }
656
657 if (SLT == MVT::f32 || SLT == MVT::f16) {
658 // 4 more v_cvt_* insts without f16 insts support
659 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
660 1 * getQuarterRateInstrCost(CostKind);
661
662 if (!HasFP32Denormals) {
663 // FP mode switches.
664 Cost += 2 * getFullRateInstrCost();
665 }
666
667 return LT.first * NElts * Cost;
668 }
669 break;
670 case ISD::FNEG:
671 // Use the backend' estimation. If fneg is not free each element will cost
672 // one additional instruction.
673 return TLI->isFNegFree(SLT) ? 0 : NElts;
674 default:
675 break;
676 }
677
678 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
679 Args, CxtI);
680}
681
682// Return true if there's a potential benefit from using v2f16/v2i16
683// instructions for an intrinsic, even if it requires nontrivial legalization.
685 switch (ID) {
686 case Intrinsic::fma:
687 case Intrinsic::fmuladd:
688 case Intrinsic::copysign:
689 case Intrinsic::canonicalize:
690 // There's a small benefit to using vector ops in the legalized code.
691 case Intrinsic::round:
692 case Intrinsic::uadd_sat:
693 case Intrinsic::usub_sat:
694 case Intrinsic::sadd_sat:
695 case Intrinsic::ssub_sat:
696 case Intrinsic::abs:
697 return true;
698 default:
699 return false;
700 }
701}
702
706 if (ICA.getID() == Intrinsic::fabs)
707 return 0;
708
711
712 Type *RetTy = ICA.getReturnType();
713
714 // Legalize the type.
715 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
716
717 unsigned NElts = LT.second.isVector() ?
718 LT.second.getVectorNumElements() : 1;
719
720 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
721
722 if (SLT == MVT::f64)
723 return LT.first * NElts * get64BitInstrCost(CostKind);
724
725 if ((ST->has16BitInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
726 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
727 NElts = (NElts + 1) / 2;
728
729 // TODO: Get more refined intrinsic costs?
730 unsigned InstRate = getQuarterRateInstrCost(CostKind);
731
732 switch (ICA.getID()) {
733 case Intrinsic::fma:
734 case Intrinsic::fmuladd:
735 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
736 InstRate = getFullRateInstrCost();
737 else {
738 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
739 : getQuarterRateInstrCost(CostKind);
740 }
741 break;
742 case Intrinsic::copysign:
743 return NElts * getFullRateInstrCost();
744 case Intrinsic::canonicalize: {
745 assert(SLT != MVT::f64);
746 InstRate = getFullRateInstrCost();
747 break;
748 }
749 case Intrinsic::uadd_sat:
750 case Intrinsic::usub_sat:
751 case Intrinsic::sadd_sat:
752 case Intrinsic::ssub_sat: {
753 if (SLT == MVT::i16 || SLT == MVT::i32)
754 InstRate = getFullRateInstrCost();
755
756 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
757 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
758 NElts = 1;
759 break;
760 }
761 case Intrinsic::abs:
762 // Expansion takes 2 instructions for VALU
763 if (SLT == MVT::i16 || SLT == MVT::i32)
764 InstRate = 2 * getFullRateInstrCost();
765 break;
766 default:
767 break;
768 }
769
770 return LT.first * NElts * InstRate;
771}
772
775 const Instruction *I) {
776 assert((I == nullptr || I->getOpcode() == Opcode) &&
777 "Opcode should reflect passed instruction.");
778 const bool SCost =
780 const int CBrCost = SCost ? 5 : 7;
781 switch (Opcode) {
782 case Instruction::Br: {
783 // Branch instruction takes about 4 slots on gfx900.
784 const auto *BI = dyn_cast_or_null<BranchInst>(I);
785 if (BI && BI->isUnconditional())
786 return SCost ? 1 : 4;
787 // Suppose conditional branch takes additional 3 exec manipulations
788 // instructions in average.
789 return CBrCost;
790 }
791 case Instruction::Switch: {
792 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
793 // Each case (including default) takes 1 cmp + 1 cbr instructions in
794 // average.
795 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
796 }
797 case Instruction::Ret:
798 return SCost ? 1 : 10;
799 }
800 return BaseT::getCFInstrCost(Opcode, CostKind, I);
801}
802
805 std::optional<FastMathFlags> FMF,
808 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
809
810 EVT OrigTy = TLI->getValueType(DL, Ty);
811
812 // Computes cost on targets that have packed math instructions(which support
813 // 16-bit types only).
814 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
815 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
816
817 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
818 return LT.first * getFullRateInstrCost();
819}
820
823 FastMathFlags FMF,
825 EVT OrigTy = TLI->getValueType(DL, Ty);
826
827 // Computes cost on targets that have packed math instructions(which support
828 // 16-bit types only).
829 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
830 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
831
832 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
833 return LT.first * getHalfRateInstrCost(CostKind);
834}
835
838 unsigned Index, Value *Op0,
839 Value *Op1) {
840 switch (Opcode) {
841 case Instruction::ExtractElement:
842 case Instruction::InsertElement: {
843 unsigned EltSize
844 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
845 if (EltSize < 32) {
846 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
847 return 0;
848 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
849 Op1);
850 }
851
852 // Extracts are just reads of a subregister, so are free. Inserts are
853 // considered free because we don't want to have any cost for scalarizing
854 // operations, and we don't have to copy into a different register class.
855
856 // Dynamic indexing isn't free and is best avoided.
857 return Index == ~0u ? 2 : 0;
858 }
859 default:
860 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
861 }
862}
863
864/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
865/// this is analyzing the collective result of all output registers. Otherwise,
866/// this is only querying a specific result index if this returns multiple
867/// registers in a struct.
869 const CallInst *CI, ArrayRef<unsigned> Indices) const {
870 // TODO: Handle complex extract indices
871 if (Indices.size() > 1)
872 return true;
873
874 const DataLayout &DL = CI->getDataLayout();
875 const SIRegisterInfo *TRI = ST->getRegisterInfo();
876 TargetLowering::AsmOperandInfoVector TargetConstraints =
877 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
878
879 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
880
881 int OutputIdx = 0;
882 for (auto &TC : TargetConstraints) {
883 if (TC.Type != InlineAsm::isOutput)
884 continue;
885
886 // Skip outputs we don't care about.
887 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
888 continue;
889
891
893 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
894
895 // For AGPR constraints null is returned on subtargets without AGPRs, so
896 // assume divergent for null.
897 if (!RC || !TRI->isSGPRClass(RC))
898 return true;
899 }
900
901 return false;
902}
903
905 const IntrinsicInst *ReadReg) const {
906 Metadata *MD =
907 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
909 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
910
911 // Special case registers that look like VCC.
912 MVT VT = MVT::getVT(ReadReg->getType());
913 if (VT == MVT::i1)
914 return true;
915
916 // Special case scalar registers that start with 'v'.
917 if (RegName.starts_with("vcc") || RegName.empty())
918 return false;
919
920 // VGPR or AGPR is divergent. There aren't any specially named vector
921 // registers.
922 return RegName[0] == 'v' || RegName[0] == 'a';
923}
924
925/// \returns true if the result of the value could potentially be
926/// different across workitems in a wavefront.
928 if (const Argument *A = dyn_cast<Argument>(V))
930
931 // Loads from the private and flat address spaces are divergent, because
932 // threads can execute the load instruction with the same inputs and get
933 // different results.
934 //
935 // All other loads are not divergent, because if threads issue loads with the
936 // same arguments, they will always get the same result.
937 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
938 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
939 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
940
941 // Atomics are divergent because they are executed sequentially: when an
942 // atomic operation refers to the same address in each thread, then each
943 // thread after the first sees the value written by the previous thread as
944 // original value.
945 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
946 return true;
947
948 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
949 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
950 return isReadRegisterSourceOfDivergence(Intrinsic);
951
952 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
953 }
954
955 // Assume all function calls are a source of divergence.
956 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
957 if (CI->isInlineAsm())
959 return true;
960 }
961
962 // Assume all function calls are a source of divergence.
963 if (isa<InvokeInst>(V))
964 return true;
965
966 return false;
967}
968
// isAlwaysUniform - return true when V is provably lane-invariant (uniform
// across a wave) and may therefore live in an SGPR. Handles: intrinsics the
// target declares always-uniform, inline-asm results (not uniform), TID-x
// shifted/masked down by at least the wavefront size in a 1D dispatch, and
// the i1/SGPR component extracted from amdgcn_if/amdgcn_else or from a mixed
// SGPR/VGPR inline-asm struct return.
969bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
970 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
971 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
972
// Inline-asm call results are conservatively not uniform here.
// NOTE(review): original line 975 is absent from this dump — confirm nothing
// was dropped between the isInlineAsm() check and the return.
973 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
974 if (CI->isInlineAsm())
976 return false;
977 }
978
979 // In most cases TID / wavefrontsize is uniform.
980 //
981 // However, if a kernel has uneven dimensions we can have a value of
982 // workitem-id-x divided by the wavefrontsize non-uniform. For example
983 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
984 // packed into a same wave which gives 1 and 0 after the division by 64
985 // respectively.
986 //
987 // FIXME: limit it to 1D kernels only, although it should be possible
988 // to perform this optimization if the size of the X dimension is a power
989 // of 2; we just do not currently have infrastructure to query it.
990 using namespace llvm::PatternMatch;
991 uint64_t C;
// workitem_id_x >> C (logical or arithmetic) is uniform when C covers the
// whole wavefront and the dispatch is one-dimensional (Y and Z max IDs == 0).
992 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
993 m_ConstantInt(C))) ||
994 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
995 m_ConstantInt(C)))) {
996 const Function *F = cast<Instruction>(V)->getFunction();
997 return C >= ST->getWavefrontSizeLog2() &&
998 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
999 }
1000
// Same reasoning for workitem_id_x & Mask: uniform if the mask provably
// clears all bits below the wavefront size (again 1D-dispatch only).
1001 Value *Mask;
1002 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1003 m_Value(Mask)))) {
1004 const Function *F = cast<Instruction>(V)->getFunction();
1005 const DataLayout &DL = F->getDataLayout();
1006 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1007 ST->getWavefrontSizeLog2() &&
1008 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1009 }
1010
// Remaining cases all look through an extractvalue from a call result.
1011 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1012 if (!ExtValue)
1013 return false;
1014
1015 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1016 if (!CI)
1017 return false;
1018
// Element 1 of the amdgcn_if/amdgcn_else aggregate result is uniform.
1019 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1020 switch (Intrinsic->getIntrinsicID()) {
1021 default:
1022 return false;
1023 case Intrinsic::amdgcn_if:
1024 case Intrinsic::amdgcn_else: {
1025 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1026 return Indices.size() == 1 && Indices[0] == 1;
1027 }
1028 }
1029 }
1030
1031 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1032 // divergent for the overall struct return. We need to override it in the
1033 // case we're extracting an SGPR component here.
1034 if (CI->isInlineAsm())
1035 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1036
1037 return false;
1038}
1039
// collectFlatAddressOperands - record which operand indices of intrinsic IID
// carry flat pointers eligible for address-space inference/rewriting; returns
// true when OpIndexes was populated. For the four intrinsics below, operand 0
// is the pointer operand.
// NOTE(review): the opening signature line (orig. 1040) is missing from this
// dump; per the member listing the declaration is
//   bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
// — confirm against the upstream file before editing.
1041 Intrinsic::ID IID) const {
1042 switch (IID) {
1043 case Intrinsic::amdgcn_is_shared:
1044 case Intrinsic::amdgcn_is_private:
1045 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1046 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1047 OpIndexes.push_back(0);
1048 return true;
1049 default:
1050 return false;
1051 }
1052}
1053
// rewriteIntrinsicWithAddressSpace - rewrite intrinsic call II so that its
// flat-pointer argument OldV is replaced by NewV, whose address space is now
// known. Returns the replacement value (possibly II itself after mutation),
// or nullptr when no rewrite is possible.
// NOTE(review): this dump is missing original lines 1054 (signature start),
// 1062, 1066, 1109 and 1112 — the two ternary expressions below and the
// fmin/fmax case are therefore truncated; consult the upstream file before
// changing any code here.
1055 Value *OldV,
1056 Value *NewV) const {
1057 auto IntrID = II->getIntrinsicID();
1058 switch (IntrID) {
// is_shared/is_private fold to a compile-time boolean once the address
// space of the pointer is known.
1059 case Intrinsic::amdgcn_is_shared:
1060 case Intrinsic::amdgcn_is_private: {
1061 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1063 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1064 LLVMContext &Ctx = NewV->getType()->getContext();
1065 ConstantInt *NewVal = (TrueAS == NewAS) ?
1067 return NewVal;
1068 }
1069 case Intrinsic::ptrmask: {
1070 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1071 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1072 Value *MaskOp = II->getArgOperand(1);
1073 Type *MaskTy = MaskOp->getType();
1074
1075 bool DoTruncate = false;
1076
1077 const GCNTargetMachine &TM =
1078 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1079 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1080 // All valid 64-bit to 32-bit casts work by chopping off the high
1081 // bits. Any masking only clearing the low bits will also apply in the new
1082 // address space.
1083 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1084 DL.getPointerSizeInBits(NewAS) != 32)
1085 return nullptr;
1086
1087 // TODO: Do we need to thread more context in here?
// The mask must preserve all 32 low bits (>= 32 leading ones) for the
// shrink to 32-bit pointers to be semantically safe.
1088 KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1089 if (Known.countMinLeadingOnes() < 32)
1090 return nullptr;
1091
1092 DoTruncate = true;
1093 }
1094
1095 IRBuilder<> B(II);
1096 if (DoTruncate) {
1097 MaskTy = B.getInt32Ty();
1098 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1099 }
1100
// Re-emit ptrmask with the new pointer type and (possibly truncated) mask.
1101 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1102 {NewV, MaskOp});
1103 }
// Flat atomic fmin/fmax: remangle the intrinsic for the new pointer type and
// update the call in place. (Lines 1109/1112 — the address-space guard and
// the getOrInsertDeclaration call — were dropped by the extraction.)
1104 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1105 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1106 Type *DestTy = II->getType();
1107 Type *SrcTy = NewV->getType();
1108 unsigned NewAS = SrcTy->getPointerAddressSpace();
1110 return nullptr;
1111 Module *M = II->getModule();
1113 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1114 II->setArgOperand(0, NewV);
1115 II->setCalledFunction(NewDecl);
1116 return II;
1117 }
1118 default:
1119 return nullptr;
1120 }
1121}
1122
// getShuffleCost - AMDGPU-specific shuffle cost model. For fixed vectors of
// 16-bit elements the cost is expressed in v_perm-style operations: roughly
// one op per pair of requested elements, plus one mask constant per distinct
// permute mask; some 2-element swizzles are free with VOP3P op_sel.
// Everything else falls back to the base implementation.
// NOTE(review): this dump is missing original lines 1123/1125/1127 (parts of
// the signature, including the Kind and CostKind parameters), 1137 (the `if`
// guarding the 16-bit path), 1147, 1157-1158 and 1166 (several `case`
// labels). The switch below is therefore incomplete — do not edit without
// the upstream file.
1124 VectorType *VT, ArrayRef<int> Mask,
1126 int Index, VectorType *SubTp,
1128 const Instruction *CxtI) {
1129 if (!isa<FixedVectorType>(VT))
1130 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1131
// Normalize the shuffle kind from the concrete mask before costing.
1132 Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
1133
1134 // Larger vector widths may require additional instructions, but are
1135 // typically cheaper than scalarized versions.
1136 unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1138 DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1139 bool HasVOP3P = ST->hasVOP3PInsts();
// Only mask positions that actually request an element cost anything.
1140 unsigned RequestedElts =
1141 count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1142 if (RequestedElts == 0)
1143 return 0;
1144 switch (Kind) {
1145 case TTI::SK_Broadcast:
1146 case TTI::SK_Reverse:
1148 // With op_sel VOP3P instructions freely can access the low half or high
1149 // half of a register, so any swizzle of two elements is free.
1150 if (HasVOP3P && NumVectorElts == 2)
1151 return 0;
1152 unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1153 // SK_Broadcast just reuses the same mask
1154 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1155 return NumPerms + NumPermMasks;
1156 }
1159 // Even aligned accesses are free
1160 if (!(Index % 2))
1161 return 0;
1162 // Insert/extract subvectors only require shifts / extract code to get the
1163 // relevant bits
1164 return alignTo(RequestedElts, 2) / 2;
1165 }
1167 case TTI::SK_Splice:
1168 case TTI::SK_Select: {
1169 unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1170 // SK_Select just reuses the same mask
1171 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1172 return NumPerms + NumPermMasks;
1173 }
1174
1175 default:
1176 break;
1177 }
1178 }
1179
1180 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1181}
1182
1183/// Whether it is profitable to sink the operands of an
1184/// Instruction I to the basic block of I.
1185/// This helps using several modifiers (like abs and neg) more often.
// NOTE(review): the signature start (orig. 1186) is missing from this dump;
// per the member listing the declaration is
//   bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
// — confirm against the upstream file.
1187 SmallVectorImpl<Use *> &Ops) const {
1188 using namespace PatternMatch;
1189
1190 for (auto &Op : I->operands()) {
1191 // Ensure we are not already sinking this operand.
1192 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1193 continue;
1194
// fabs/fneg operands are collected because they can fold into source
// modifiers of the user instruction once co-located.
1195 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
1196 Ops.push_back(&Op);
1197 }
1198
// Profitable iff at least one sinkable operand was found.
1199 return !Ops.empty();
1200}
1201
// areInlineCompatible - decide whether Callee may be inlined into Caller:
// (1) the callee's subtarget features (minus InlineFeatureIgnoreList) must be
// a subset of the caller's; (2) the FP mode-register defaults must be
// compatible; (3) unless the callee is AlwaysInline/InlineHint, the combined
// basic-block count must stay within the InlineMaxBB compile-time cap.
// NOTE(review): the signature start (orig. 1202) is missing from this dump;
// per the member listing the declaration is
//   bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1203 const Function *Callee) const {
1204 const TargetMachine &TM = getTLI()->getTargetMachine();
1205 const GCNSubtarget *CallerST
1206 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1207 const GCNSubtarget *CalleeST
1208 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1209
1210 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1211 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1212
// Feature-subset check: every (non-ignored) callee feature must also be
// enabled in the caller, otherwise inlining could miscompile.
1213 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1214 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1215 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1216 return false;
1217
1218 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1219 // no way to support merge for backend defined attributes.
1220 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1221 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1222 if (!CallerMode.isInlineCompatible(CalleeMode))
1223 return false;
1224
// Explicit inline hints bypass the BB-count compile-time heuristic below.
1225 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1226 Callee->hasFnAttribute(Attribute::InlineHint))
1227 return true;
1228
1229 // Hack to make compile times reasonable.
1230 if (InlineMaxBB) {
1231 // Single BB does not increase total BB amount.
1232 if (Callee->size() == 1)
1233 return true;
// -1 because the callee's entry block merges into the call site's block.
1234 size_t BBSize = Caller->size() + Callee->size() - 1;
1235 return BBSize <= InlineMaxBB;
1236 }
1237
1238 return true;
1239}
1240
// adjustInliningThresholdUsingCallee - estimate an inline-threshold bonus for
// call site CB based on how many SGPRs/VGPRs its arguments occupy: arguments
// beyond what the calling convention passes in registers spill to the stack,
// so inlining them is worth extra threshold proportional to the stack
// load/store cost.
// NOTE(review): this dump is missing original lines 1241 (signature start:
//   static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, ),
// 1258 (the condition choosing SGPR vs VGPR accounting) and 1273/1276 (the
// trailing cost-kind arguments of the getMemoryOpCost calls) — confirm
// against the upstream file before editing.
1242 const SITargetLowering *TLI,
1243 const GCNTTIImpl *TTIImpl) {
// Register budgets before arguments start spilling to the stack.
1244 const int NrOfSGPRUntilSpill = 26;
1245 const int NrOfVGPRUntilSpill = 32;
1246
1247 const DataLayout &DL = TTIImpl->getDataLayout();
1248
1249 unsigned adjustThreshold = 0;
1250 int SGPRsInUse = 0;
1251 int VGPRsInUse = 0;
// Count the calling-convention registers each argument value occupies.
1252 for (const Use &A : CB->args()) {
1253 SmallVector<EVT, 4> ValueVTs;
1254 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1255 for (auto ArgVT : ValueVTs) {
1256 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1257 CB->getContext(), CB->getCallingConv(), ArgVT);
1259 SGPRsInUse += CCRegNum;
1260 else
1261 VGPRsInUse += CCRegNum;
1262 }
1263 }
1264
1265 // The cost of passing function arguments through the stack:
1266 // 1 instruction to put a function argument on the stack in the caller.
1267 // 1 instruction to take a function argument from the stack in callee.
1268 // 1 instruction to explicitly take care of data dependencies in the callee
1269 // function.
1270 InstructionCost ArgStackCost(1);
1271 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1272 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1274 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1275 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1277
1278 // The penalty cost is computed relative to the cost of instructions and does
1279 // not model any storage costs.
1280 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1281 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1282 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1283 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1284 return adjustThreshold;
1285}
1286
// getCallArgsTotalAllocaSize - sum (in bytes) of the distinct static allocas
// passed as flat/private pointer arguments to CB. Used by the inliner
// heuristics below to detect calls that would otherwise leave scratch usage.
1287static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1288 const DataLayout &DL) {
1289 // If we have a pointer to a private array passed into a function
1290 // it will not be optimized out, leaving scratch usage.
1291 // This function calculates the total size in bytes of the memory that would
1292 // end in scratch if the call was not inlined.
1293 unsigned AllocaSize = 0;
// NOTE(review): the declaration of AIVisited (orig. line 1294, a set used to
// count each alloca once even when passed via multiple arguments) was dropped
// by the extraction — confirm against the upstream file.
1295 for (Value *PtrArg : CB->args()) {
1296 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1297 if (!Ty)
1298 continue;
1299
// Only flat or private pointers can refer to scratch memory.
1300 unsigned AddrSpace = Ty->getAddressSpace();
1301 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1302 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1303 continue;
1304
// Count each static alloca at most once across all arguments.
1305 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1306 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1307 continue;
1308
1309 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1310 }
1311 return AllocaSize;
1312}
1313
1317}
1318
// adjustInliningThreshold - total AMDGPU-specific inline-threshold bonus for
// call site CB: the register-pressure-based bonus from
// adjustInliningThresholdUsingCallee plus a flat ArgAllocaCost bonus whenever
// any private alloca is passed by pointer (inlining then lets SROA remove it).
// NOTE(review): the signature line (orig. 1319) is missing from this dump;
// per the member listing the declaration is
//   unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1320 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1321
1322 // Private object passed as arguments may end up in scratch usage if the call
1323 // is not inlined. Increase the inline threshold to promote inlining.
1324 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1325 if (AllocaSize > 0)
1326 Threshold += ArgAllocaCost;
1327 return Threshold;
1328}
1329
// getCallerAllocaCost - per-alloca cost charged by the inliner for a private
// alloca AI passed into call CB. Below ArgAllocaCutoff the allocas are
// assumed optimizable and cost nothing; above it, the costs of all allocas
// are chosen so their sum cancels the ArgAllocaCost threshold bonus granted
// in adjustInliningThreshold (see the derivation in the comments below).
// NOTE(review): the signature start (orig. 1330) is missing from this dump;
// per the member listing the declaration is
//   unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1331 const AllocaInst *AI) const {
1332
1333 // Below the cutoff, assume that the private memory objects would be
1334 // optimized
1335 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1336 if (AllocaSize <= ArgAllocaCutoff)
1337 return 0;
1338
1339 // Above the cutoff, we give a cost to each private memory object
1340 // depending on its size. If the array can be optimized by SROA this cost is
1341 // not added to the total-cost in the inliner cost analysis.
1342 //
1343 // We choose the total cost of the alloca such that their sum cancels the
1344 // bonus given in the threshold (ArgAllocaCost).
1345 //
1346 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1347 //
1348 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1349 // the single-bb bonus and the vector-bonus.
1350 //
1351 // We compensate the first two multipliers, by repeating logic from the
1352 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1353 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1354 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1355
// Mirror the inliner's single-basic-block bonus (+50%): the callee is
// "single BB" when no terminator has more than one successor.
1356 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1357 return BB.getTerminator()->getNumSuccessors() > 1;
1358 });
1359 if (SingleBB) {
1360 Threshold += Threshold / 2;
1361 }
1362
1363 auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
1364
1365 // Attribute the bonus proportionally to the alloca size
1366 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1367
1368 return AllocaThresholdBonus;
1369}
1370
1374 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1375}
1376
1379 CommonTTI.getPeelingPreferences(L, SE, PP);
1380}
1381
1382int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1383 return ST->hasFullRate64Ops()
1384 ? getFullRateInstrCost()
1385 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1386 : getQuarterRateInstrCost(CostKind);
1387}
1388
1389std::pair<InstructionCost, MVT>
1390GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1391 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1392 auto Size = DL.getTypeSizeInBits(Ty);
1393 // Maximum load or store can handle 8 dwords for scalar and 4 for
1394 // vector ALU. Let's assume anything above 8 dwords is expensive
1395 // even if legal.
1396 if (Size <= 256)
1397 return Cost;
1398
1399 Cost.first += (Size + 255) / 256;
1400 return Cost;
1401}
1402
1404 return ST->hasPrefetch() ? 128 : 0;
1405}
1406
1409}
1410
// collectKernelLaunchBounds - append the kernel launch-bound attributes of F
// to LB as (name, value) pairs: the per-dimension max workgroup counts, the
// flat workgroup size range, and the waves-per-EU range.
// NOTE(review): this dump is missing original lines 1411 (signature start:
//   void GCNTTIImpl::collectKernelLaunchBounds( ) and 1419 — the latter is
// presumably the ST->getFlatWorkGroupSizes(F) call initializing
// FlatWorkGroupSize below; confirm against the upstream file.
1412 const Function &F,
1413 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1414 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1415 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1416 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1417 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1418 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1420 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1421 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1422 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1423 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1424 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1425}
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering memcpy as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file provides a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
Hexagon Common GEP
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool hasMadMacF32Insts() const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
bool hasVOP3PInsts() const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
an instruction to allocate memory on the stack
Definition: Instructions.h:63
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:117
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1408
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
CallingConv::ID getCallingConv() const
Definition: InstrTypes.h:1399
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
Definition: InstrTypes.h:1317
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
This instruction extracts a struct member or array element value from an aggregate value.
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
Generation getGeneration() const
Definition: GCNSubtarget.h:327
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isAlwaysUniform(const Value *V) const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
int getInliningLastCallToStaticBonus() const
unsigned getNumberOfRegisters(unsigned RCID) const
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
unsigned getInliningThresholdMultiplier() const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getMinVectorRegisterBitWidth() const
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
unsigned adjustInliningThreshold(const CallBase *CB) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
bool isSourceOfDivergence(const Value *V) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool hasBranchDivergence(const Function *F=nullptr) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Metadata node.
Definition: Metadata.h:1073
Machine Value Type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
std::vector< AsmOperandInfo > AsmOperandInfoVector
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:81
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
const DataLayout & getDataLayout() const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1094
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
Type * getElementType() const
Definition: DerivedTypes.h:460
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:756
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:931
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:480
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1067
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ FAdd
Sum of floats.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:243
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...