LLVM 22.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/Function.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "AMDGPUtti"
35
// NOTE(review): doxygen-scrape artifact — each line below still carries the
// upstream file's line number, and the `static cl::opt<TYPE> Name(` opening
// line of most options was dropped by the extraction (missing numbers 36, 41,
// 46, 51, 56, 68, 74, 80). Restore the declarations from upstream
// AMDGPUTargetTransformInfo.cpp before attempting to compile.

// Unroll-threshold used when a loop addresses private (scratch) memory;
// presumably cl::opt<unsigned> UnrollThresholdPrivate — TODO confirm.
37 "amdgpu-unroll-threshold-private",
38 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
39 cl::init(2700), cl::Hidden);
40
// Unroll-threshold used when a loop addresses local (LDS) memory.
42 "amdgpu-unroll-threshold-local",
43 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
44 cl::init(1000), cl::Hidden);
45
// Per-"if" threshold increment applied in getUnrollingPreferences.
47 "amdgpu-unroll-threshold-if",
48 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
49 cl::init(200), cl::Hidden);
50
// Boolean switch gating runtime unrolling for LDS-using loops (init(true)).
52 "amdgpu-unroll-runtime-local",
53 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
54 cl::init(true), cl::Hidden);
55
// Block-size cap used at the end of getUnrollingPreferences.
57 "amdgpu-unroll-max-block-to-analyze",
58 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
59 cl::init(32), cl::Hidden);
60
// Inline-cost credit for each alloca passed by argument (declaration intact).
61static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
62 cl::Hidden, cl::init(4000),
63 cl::desc("Cost of alloca argument"));
64
65// If the amount of scratch memory to eliminate exceeds our ability to allocate
66// it into registers we gain nothing by aggressively inlining functions for that
67// heuristic.
69 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
70 cl::init(256),
71 cl::desc("Maximum alloca size to use for inline cost"));
72
73// Inliner constraint to achieve reasonable compilation time.
75 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
76 cl::desc("Maximum number of BBs allowed in a function after inlining"
77 " (compile time constraint)"));
78
79// This default unroll factor is based on microbenchmarks on gfx1030.
81 "amdgpu-memcpy-loop-unroll",
82 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
83 "operations when lowering memcpy as a loop"),
84 cl::init(16), cl::Hidden);
85
86static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
87 unsigned Depth = 0) {
88 const Instruction *I = dyn_cast<Instruction>(Cond);
89 if (!I)
90 return false;
91
92 for (const Value *V : I->operand_values()) {
93 if (!L->contains(I))
94 continue;
95 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
96 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
97 return SubLoop->contains(PHI); }))
98 return true;
99 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
100 return true;
101 }
102 return false;
103}
104
// AMDGPUTTIImpl constructor: caches the target triple, the GCN subtarget for
// function F, and its lowering info.
// NOTE(review): the signature line (upstream line 105,
// `AMDGPUTTIImpl::AMDGPUTTIImpl(...)`) was dropped by the extraction.
106 : BaseT(TM, F.getDataLayout()),
107 TargetTriple(TM->getTargetTriple()),
108 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
109 TLI(ST->getTargetLowering()) {}
110
// AMDGPUTTIImpl::getUnrollingPreferences — tunes loop-unrolling parameters for
// AMDGPU: a base threshold from the "amdgpu-unroll-threshold" fn attribute
// (default 300), boosted up to UnrollThresholdPrivate / UnrollThresholdLocal
// when the loop addresses private or local memory, plus a per-"if" bonus.
// NOTE(review): doxygen-scrape gaps — the signature (upstream 111-112) and
// lines 146, 176, 193, 195, 214, 229 and 272 were dropped; restore from
// upstream before compiling.
113 OptimizationRemarkEmitter *ORE) const {
114 const Function &F = *L->getHeader()->getParent();
115 UP.Threshold =
116 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
117 UP.MaxCount = std::numeric_limits<unsigned>::max();
118 UP.Partial = true;
119
120 // Conditional branch in a loop back edge needs 3 additional exec
121 // manipulations in average.
122 UP.BEInsns += 3;
123
124 // We want to run unroll even for the loops which have been vectorized.
125 UP.UnrollVectorizedLoop = true;
126
127 // TODO: Do we want runtime unrolling?
128
129 // Maximum alloca size than can fit registers. Reserve 16 registers.
130 const unsigned MaxAlloca = (256 - 16) * 4;
131 unsigned ThresholdPrivate = UnrollThresholdPrivate;
132 unsigned ThresholdLocal = UnrollThresholdLocal;
133
134 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
135 // provided threshold value as the default for Threshold
136 if (MDNode *LoopUnrollThreshold =
137 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
138 if (LoopUnrollThreshold->getNumOperands() == 2) {
139 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
140 LoopUnrollThreshold->getOperand(1));
141 if (MetaThresholdValue) {
142 // We will also use the supplied value for PartialThreshold for now.
143 // We may introduce additional metadata if it becomes necessary in the
144 // future.
145 UP.Threshold = MetaThresholdValue->getSExtValue();
// NOTE(review): line 146 (presumably `UP.PartialThreshold = UP.Threshold;`)
// was dropped here.
147 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
148 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
149 }
150 }
151 }
152
153 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
154 for (const BasicBlock *BB : L->getBlocks()) {
155 const DataLayout &DL = BB->getDataLayout();
156 unsigned LocalGEPsSeen = 0;
157
158 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
159 return SubLoop->contains(BB); }))
160 continue; // Block belongs to an inner loop.
161
162 for (const Instruction &I : *BB) {
163 // Unroll a loop which contains an "if" statement whose condition
164 // defined by a PHI belonging to the loop. This may help to eliminate
165 // if region and potentially even PHI itself, saving on both divergence
166 // and registers used for the PHI.
167 // Add a small bonus for each of such "if" statements.
168 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
169 if (UP.Threshold < MaxBoost && Br->isConditional()) {
170 BasicBlock *Succ0 = Br->getSuccessor(0);
171 BasicBlock *Succ1 = Br->getSuccessor(1);
172 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
173 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
174 continue;
175 if (dependsOnLocalPhi(L, Br->getCondition())) {
// NOTE(review): line 176 (the threshold increment by UnrollThresholdIf) was
// dropped here.
177 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
178 << " for loop:\n"
179 << *L << " due to " << *Br << '\n');
180 if (UP.Threshold >= MaxBoost)
181 return;
182 }
183 }
184 continue;
185 }
186
187 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
188 if (!GEP)
189 continue;
190
191 unsigned AS = GEP->getAddressSpace();
192 unsigned Threshold = 0;
// NOTE(review): lines 193 and 195 (the `if (AS == ...)` conditions selecting
// private vs. local address space) were dropped here.
194 Threshold = ThresholdPrivate;
196 Threshold = ThresholdLocal;
197 else
198 continue;
199
200 if (UP.Threshold >= Threshold)
201 continue;
202
203 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
204 const Value *Ptr = GEP->getPointerOperand();
205 const AllocaInst *Alloca =
206 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
207 if (!Alloca || !Alloca->isStaticAlloca())
208 continue;
209 Type *Ty = Alloca->getAllocatedType();
210 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
211 if (AllocaSize > MaxAlloca)
212 continue;
213 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
// NOTE(review): line 214 (the second operand of the ||, presumably
// REGION_ADDRESS) was dropped here.
215 LocalGEPsSeen++;
216 // Inhibit unroll for local memory if we have seen addressing not to
217 // a variable, most likely we will be unable to combine it.
218 // Do not unroll too deep inner loops for local memory to give a chance
219 // to unroll an outer loop for a more important reason.
220 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
221 continue;
222
223 const Value *V = getUnderlyingObject(GEP->getPointerOperand());
224 if (!isa<GlobalVariable>(V) && !isa<Argument>(V))
225 continue;
226
227 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
228 << *L << " due to LDS use.\n");
// NOTE(review): line 229 (presumably `UP.Runtime = UnrollRuntimeLocal;`) was
// dropped here.
230 }
231
232 // Check if GEP depends on a value defined by this loop itself.
233 bool HasLoopDef = false;
234 for (const Value *Op : GEP->operands()) {
235 const Instruction *Inst = dyn_cast<Instruction>(Op);
236 if (!Inst || L->isLoopInvariant(Op))
237 continue;
238
239 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
240 return SubLoop->contains(Inst); }))
241 continue;
242 HasLoopDef = true;
243 break;
244 }
245 if (!HasLoopDef)
246 continue;
247
248 // We want to do whatever we can to limit the number of alloca
249 // instructions that make it through to the code generator. allocas
250 // require us to use indirect addressing, which is slow and prone to
251 // compiler bugs. If this loop does an address calculation on an
252 // alloca ptr, then we want to use a higher than normal loop unroll
253 // threshold. This will give SROA a better chance to eliminate these
254 // allocas.
255 //
256 // We also want to have more unrolling for local memory to let ds
257 // instructions with different offsets combine.
258 //
259 // Don't use the maximum allowed value here as it will make some
260 // programs way too big.
261 UP.Threshold = Threshold;
262 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
263 << " for loop:\n"
264 << *L << " due to " << *GEP << '\n');
265 if (UP.Threshold >= MaxBoost)
266 return;
267 }
268
269 // If we got a GEP in a small BB from inner loop then increase max trip
270 // count to analyze for better estimation cost in unroll
271 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
// NOTE(review): line 272 (presumably `UP.MaxIterationsCountToAnalyze = 32;`)
// was dropped here.
273 }
274}
275
// Tail of AMDGPUTTIImpl::getPeelingPreferences.
// NOTE(review): the signature line (upstream 276) and the body line (upstream
// 278, presumably the `BaseT::getPeelingPreferences(...)` delegation) were
// dropped by the extraction.
277 TTI::PeelingPreferences &PP) const {
279}
280
// NOTE(review): signature line (upstream 281) dropped — presumably
// AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold(); verify upstream.
282 return 1024;
283}
284
285const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
286 // Codegen control options which don't matter.
287 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
288 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
289 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
290 AMDGPU::FeatureUnalignedAccessMode,
291
292 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
293
294 // Property of the kernel/environment which can't actually differ.
295 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
296 AMDGPU::FeatureTrapHandler,
297
298 // The default assumption needs to be ecc is enabled, but no directly
299 // exposed operations depend on it, so it can be safely inlined.
300 AMDGPU::FeatureSRAMECC,
301
302 // Perf-tuning features
303 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
304
// GCNTTIImpl constructor: caches subtarget/lowering info, records whether F is
// a graphics entry point, and snapshots the FP denormal modes (anything other
// than preserve-sign counts as "has denormals").
// NOTE(review): the signature line (upstream 305) was dropped by the
// extraction.
306 : BaseT(TM, F.getDataLayout()),
307 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
308 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
309 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
310 SIModeRegisterDefaults Mode(F, *ST);
311 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
312 HasFP64FP16Denormals =
313 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
314}
315
// NOTE(review): signature line (upstream 316) dropped — presumably
// GCNTTIImpl::hasBranchDivergence(const Function *F); branches are divergent
// unless the function is known to execute on a single lane.
317 return !F || !ST->isSingleLaneExecution(*F);
318}
319
320unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
321 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
322 // registers. See getRegisterClassForType for the implementation.
323 // In this case vector registers are not vector in terms of
324 // VGPRs, but those which can hold multiple values.
325
326 // This is really the number of registers to fill when vectorizing /
327 // interleaving loops, so we lie to avoid trying to use all registers.
328 return 4;
329}
330
// GCNTTIImpl::getRegisterBitWidth: 32-bit scalar registers, 64-bit "vector"
// registers when packed-FP32 ops exist (else 32), no scalable vectors.
// NOTE(review): the signature (upstream 331-332) and the `case` labels
// (upstream 334, 336, 338 — presumably Scalar / FixedWidthVector /
// ScalableVector register kinds) were dropped by the extraction.
333 switch (K) {
335 return TypeSize::getFixed(32);
337 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
339 return TypeSize::getScalable(0);
340 }
341 llvm_unreachable("Unsupported register kind");
342}
343
// NOTE(review): signature line (upstream 344) dropped — presumably
// GCNTTIImpl::getMinVectorRegisterBitWidth(); verify upstream.
345 return 32;
346}
347
348unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
349 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
350 return 32 * 4 / ElemWidth;
351 // For a given width return the max 0number of elements that can be combined
352 // into a wider bit value:
353 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
354 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
355 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
356 : 1;
357}
358
359unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
360 unsigned ChainSizeInBytes,
361 VectorType *VecTy) const {
362 unsigned VecRegBitWidth = VF * LoadSize;
363 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
364 // TODO: Support element-size less than 32bit?
365 return 128 / LoadSize;
366
367 return VF;
368}
369
370unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
371 unsigned ChainSizeInBytes,
372 VectorType *VecTy) const {
373 unsigned VecRegBitWidth = VF * StoreSize;
374 if (VecRegBitWidth > 128)
375 return 128 / StoreSize;
376
377 return VF;
378}
379
// Maximum vector register width (bits) usable for loads/stores in the given
// address space: 512 for global/constant/buffer-like spaces, element-size
// limited for private, 128 otherwise.
// NOTE(review): upstream lines 383 and 386 (two more address-space operands of
// the condition, e.g. CONSTANT_ADDRESS_32BIT and the closing `... ) {`) were
// dropped by the extraction.
380unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
381 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
382 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
384 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
385 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
387 return 512;
388 }
389
390 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
391 return 8 * ST->getMaxPrivateElementSize();
392
393 // Common to flat, global, local and region. Assume for unknown addrspace.
394 return 128;
395}
396
397bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
398 Align Alignment,
399 unsigned AddrSpace) const {
400 // We allow vectorization of flat stores, even though we may need to decompose
401 // them later if they may access private memory. We don't have enough context
402 // here, and legalization can handle it.
403 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
404 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
405 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
406 }
407 return true;
408}
409
410bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
411 Align Alignment,
412 unsigned AddrSpace) const {
413 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
414}
415
416bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
417 Align Alignment,
418 unsigned AddrSpace) const {
419 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
420}
421
// NOTE(review): signature line (upstream 422) dropped — presumably the GCN
// override of getMaxMemIntrinsicInlineSizeThreshold(); verify upstream.
423 return 1024;
424}
425
// GCNTTIImpl::getMemcpyLoopLoweringType — picks the per-iteration access type
// for memcpy-as-loop lowering: the atomic element type when one is required,
// otherwise a v4i32 (16-byte) vector, widened by MemcpyLoopUnroll when the
// copy length is a known constant.
// NOTE(review): the signature line (upstream 426) and line 447 (the widened
// FixedVectorType::get(...) return start) were dropped by the extraction.
427 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
428 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
429 std::optional<uint32_t> AtomicElementSize) const {
430
431 if (AtomicElementSize)
432 return Type::getIntNTy(Context, *AtomicElementSize * 8);
433
434 // 16-byte accesses achieve the highest copy throughput.
435 // If the operation has a fixed known length that is large enough, it is
436 // worthwhile to return an even wider type and let legalization lower it into
437 // multiple accesses, effectively unrolling the memcpy loop.
438 // We also rely on legalization to decompose into smaller accesses for
439 // subtargets and address spaces where it is necessary.
440 //
441 // Don't unroll if Length is not a constant, since unrolling leads to worse
442 // performance for length values that are smaller or slightly larger than the
443 // total size of the type returned here. Mitigating that would require a more
444 // complex lowering for variable-length memcpy and memmove.
445 unsigned I32EltsInVector = 4;
446 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
448 MemcpyLoopUnroll * I32EltsInVector);
449
450 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
451}
452
// GCNTTIImpl::getMemcpyLoopResidualLoweringType — emits the access types for
// the residual (tail) bytes of a lowered memcpy, greedily from 16-byte v4i32
// chunks down through i64/i32/i16 to single bytes.
// NOTE(review): dropped by the extraction: the signature start (upstream 453),
// line 460 (the BaseT:: delegation call start for the atomic case), line 464
// (the `Type *I32x4Ty = ...` definition) and line 488 (the `Type *I8Ty = ...`
// definition).
454 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
455 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
456 Align SrcAlign, Align DestAlign,
457 std::optional<uint32_t> AtomicCpySize) const {
458
459 if (AtomicCpySize)
461 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
462 DestAlign, AtomicCpySize);
463
465 while (RemainingBytes >= 16) {
466 OpsOut.push_back(I32x4Ty);
467 RemainingBytes -= 16;
468 }
469
470 Type *I64Ty = Type::getInt64Ty(Context);
471 while (RemainingBytes >= 8) {
472 OpsOut.push_back(I64Ty);
473 RemainingBytes -= 8;
474 }
475
476 Type *I32Ty = Type::getInt32Ty(Context);
477 while (RemainingBytes >= 4) {
478 OpsOut.push_back(I32Ty);
479 RemainingBytes -= 4;
480 }
481
482 Type *I16Ty = Type::getInt16Ty(Context);
483 while (RemainingBytes >= 2) {
484 OpsOut.push_back(I16Ty);
485 RemainingBytes -= 2;
486 }
487
489 while (RemainingBytes) {
490 OpsOut.push_back(I8Ty);
491 --RemainingBytes;
492 }
493}
494
// NOTE(review): signature line (upstream 495) dropped — presumably
// GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF); returns 1 for scalar
// loops and 8 otherwise.
496 // Disable unrolling if the loop is not vectorized.
497 // TODO: Enable this again.
498 if (VF.isScalar())
499 return 1;
500
501 return 8;
502}
503
// GCNTTIImpl::getTgtMemIntrinsic — describes the memory behavior of the
// amdgcn.ds.ordered.{add,swap} intrinsics: pointer operand 0, ordering from
// constant operand 2 (rejected if beyond seq_cst), volatility from constant
// operand 4; both read and write memory.
// NOTE(review): the signature line (upstream 504) was dropped by the
// extraction.
505 MemIntrinsicInfo &Info) const {
506 switch (Inst->getIntrinsicID()) {
507 case Intrinsic::amdgcn_ds_ordered_add:
508 case Intrinsic::amdgcn_ds_ordered_swap: {
509 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
510 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
511 if (!Ordering || !Volatile)
512 return false; // Invalid.
513
514 unsigned OrderingVal = Ordering->getZExtValue();
515 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
516 return false;
517
518 Info.PtrVal = Inst->getArgOperand(0);
519 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
520 Info.ReadMem = true;
521 Info.WriteMem = true;
522 Info.IsVolatile = !Volatile->isZero();
523 return true;
524 }
525 default:
526 return false;
527 }
528}
529
// GCNTTIImpl::getArithmeticInstrCost — per-opcode cost model: legalizes the
// type, accounts for split vectors (NElts), halves NElts for 16-bit packed
// types where supported, and prices each ISD opcode in units of the
// full/half/quarter/64-bit instruction rates.
// NOTE(review): dropped by the extraction: the signature start (upstream 530),
// line 532 (the Op1Info/Op2Info parameters), lines 594/596/599/602 (the
// `return TargetTransformInfo::TCC_Free;`-style early returns inside the FMUL
// fusion check) and line 628 (the `if (!ST->hasUsableDivScaleConditionOutput())`
// workaround guard). Restore from upstream before compiling.
531 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
533 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
534
535 // Legalize the type.
536 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
537 int ISD = TLI->InstructionOpcodeToISD(Opcode);
538
539 // Because we don't have any legal vector operations, but the legal types, we
540 // need to account for split vectors.
541 unsigned NElts = LT.second.isVector() ?
542 LT.second.getVectorNumElements() : 1;
543
544 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
545
546 switch (ISD) {
547 case ISD::SHL:
548 case ISD::SRL:
549 case ISD::SRA:
550 if (SLT == MVT::i64)
551 return get64BitInstrCost(CostKind) * LT.first * NElts;
552
553 if (ST->has16BitInsts() && SLT == MVT::i16)
554 NElts = (NElts + 1) / 2;
555
556 // i32
557 return getFullRateInstrCost() * LT.first * NElts;
558 case ISD::ADD:
559 case ISD::SUB:
560 case ISD::AND:
561 case ISD::OR:
562 case ISD::XOR:
563 if (SLT == MVT::i64) {
564 // and, or and xor are typically split into 2 VALU instructions.
565 return 2 * getFullRateInstrCost() * LT.first * NElts;
566 }
567
568 if (ST->has16BitInsts() && SLT == MVT::i16)
569 NElts = (NElts + 1) / 2;
570
571 return LT.first * NElts * getFullRateInstrCost();
572 case ISD::MUL: {
573 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
574 if (SLT == MVT::i64) {
575 const int FullRateCost = getFullRateInstrCost();
576 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
577 }
578
579 if (ST->has16BitInsts() && SLT == MVT::i16)
580 NElts = (NElts + 1) / 2;
581
582 // i32
583 return QuarterRateCost * NElts * LT.first;
584 }
585 case ISD::FMUL:
586 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
587 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
588 // fused operation.
589 if (CxtI && CxtI->hasOneUse())
590 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
591 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
592 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
593 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
595 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
597
598 // Estimate all types may be fused with contract/unsafe flags
600 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
601 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
603 }
604 }
605 [[fallthrough]];
606 case ISD::FADD:
607 case ISD::FSUB:
608 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
609 NElts = (NElts + 1) / 2;
610 if (SLT == MVT::f64)
611 return LT.first * NElts * get64BitInstrCost(CostKind);
612
613 if (ST->has16BitInsts() && SLT == MVT::f16)
614 NElts = (NElts + 1) / 2;
615
616 if (SLT == MVT::f32 || SLT == MVT::f16)
617 return LT.first * NElts * getFullRateInstrCost();
618 break;
619 case ISD::FDIV:
620 case ISD::FREM:
621 // FIXME: frem should be handled separately. The fdiv in it is most of it,
622 // but the current lowering is also not entirely correct.
623 if (SLT == MVT::f64) {
624 int Cost = 7 * get64BitInstrCost(CostKind) +
625 getQuarterRateInstrCost(CostKind) +
626 3 * getHalfRateInstrCost(CostKind);
627 // Add cost of workaround.
629 Cost += 3 * getFullRateInstrCost();
630
631 return LT.first * Cost * NElts;
632 }
633
634 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
635 // TODO: This is more complicated, unsafe flags etc.
636 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
637 (SLT == MVT::f16 && ST->has16BitInsts())) {
638 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
639 }
640 }
641
642 if (SLT == MVT::f16 && ST->has16BitInsts()) {
643 // 2 x v_cvt_f32_f16
644 // f32 rcp
645 // f32 fmul
646 // v_cvt_f16_f32
647 // f16 div_fixup
648 int Cost =
649 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
650 return LT.first * Cost * NElts;
651 }
652
653 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
654 // Fast unsafe fdiv lowering:
655 // f32 rcp
656 // f32 fmul
657 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
658 return LT.first * Cost * NElts;
659 }
660
661 if (SLT == MVT::f32 || SLT == MVT::f16) {
662 // 4 more v_cvt_* insts without f16 insts support
663 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
664 1 * getQuarterRateInstrCost(CostKind);
665
666 if (!HasFP32Denormals) {
667 // FP mode switches.
668 Cost += 2 * getFullRateInstrCost();
669 }
670
671 return LT.first * NElts * Cost;
672 }
673 break;
674 case ISD::FNEG:
675 // Use the backend' estimation. If fneg is not free each element will cost
676 // one additional instruction.
677 return TLI->isFNegFree(SLT) ? 0 : NElts;
678 default:
679 break;
680 }
681
682 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
683 Args, CxtI);
684}
685
686// Return true if there's a potential benefit from using v2f16/v2i16
687// instructions for an intrinsic, even if it requires nontrivial legalization.
// NOTE(review): the signature line (upstream 688, presumably
// `static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {`) was
// dropped by the extraction.
689 switch (ID) {
690 case Intrinsic::fma:
691 case Intrinsic::fmuladd:
692 case Intrinsic::copysign:
693 case Intrinsic::minimumnum:
694 case Intrinsic::maximumnum:
695 case Intrinsic::canonicalize:
696 // There's a small benefit to using vector ops in the legalized code.
697 case Intrinsic::round:
698 case Intrinsic::uadd_sat:
699 case Intrinsic::usub_sat:
700 case Intrinsic::sadd_sat:
701 case Intrinsic::ssub_sat:
702 case Intrinsic::abs:
703 return true;
704 default:
705 return false;
706 }
707}
708
// GCNTTIImpl::getIntrinsicInstrCost — prices intrinsics: fabs and the various
// id/ptr-reading AMDGCN intrinsics are free; otherwise the return type is
// legalized, NElts is halved for packable 16-bit (and packed-f32) types, and a
// per-intrinsic instruction rate is selected.
// NOTE(review): dropped by the extraction: the signature (upstream 709-711),
// lines 736-737 (presumably the intrinsicHasPackedVectorBenefit guard
// delegating to BaseT) and line 781 (the ieee-mode condition that sets
// NumOps = 1). Restore from upstream before compiling.
712 switch (ICA.getID()) {
713 case Intrinsic::fabs:
714 // Free source modifier in the common case.
715 return 0;
716 case Intrinsic::amdgcn_workitem_id_x:
717 case Intrinsic::amdgcn_workitem_id_y:
718 case Intrinsic::amdgcn_workitem_id_z:
719 // TODO: If hasPackedTID, or if the calling context is not an entry point
720 // there may be a bit instruction.
721 return 0;
722 case Intrinsic::amdgcn_workgroup_id_x:
723 case Intrinsic::amdgcn_workgroup_id_y:
724 case Intrinsic::amdgcn_workgroup_id_z:
725 case Intrinsic::amdgcn_lds_kernel_id:
726 case Intrinsic::amdgcn_dispatch_ptr:
727 case Intrinsic::amdgcn_dispatch_id:
728 case Intrinsic::amdgcn_implicitarg_ptr:
729 case Intrinsic::amdgcn_queue_ptr:
730 // Read from an argument register.
731 return 0;
732 default:
733 break;
734 }
735
738
739 Type *RetTy = ICA.getReturnType();
740
741 // Legalize the type.
742 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
743
744 unsigned NElts = LT.second.isVector() ?
745 LT.second.getVectorNumElements() : 1;
746
747 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
748
749 if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
750 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
751 NElts = (NElts + 1) / 2;
752
753 // TODO: Get more refined intrinsic costs?
754 unsigned InstRate = getQuarterRateInstrCost(CostKind);
755
756 switch (ICA.getID()) {
757 case Intrinsic::fma:
758 case Intrinsic::fmuladd:
759 if (SLT == MVT::f64) {
760 InstRate = get64BitInstrCost(CostKind);
761 break;
762 }
763
764 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
765 InstRate = getFullRateInstrCost();
766 else {
767 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
768 : getQuarterRateInstrCost(CostKind);
769 }
770 break;
771 case Intrinsic::copysign:
772 return NElts * getFullRateInstrCost();
773 case Intrinsic::minimumnum:
774 case Intrinsic::maximumnum: {
775 // Instruction + 2 canonicalizes. For cases that need type promotion, we the
776 // promotion takes the place of the canonicalize.
777 unsigned NumOps = 3;
778 if (const IntrinsicInst *II = ICA.getInst()) {
779 // Directly legal with ieee=0
780 // TODO: Not directly legal with strictfp
782 NumOps = 1;
783 }
784
785 unsigned BaseRate =
786 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
787 InstRate = BaseRate * NumOps;
788 break;
789 }
790 case Intrinsic::canonicalize: {
791 InstRate =
792 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
793 break;
794 }
795 case Intrinsic::uadd_sat:
796 case Intrinsic::usub_sat:
797 case Intrinsic::sadd_sat:
798 case Intrinsic::ssub_sat: {
799 if (SLT == MVT::i16 || SLT == MVT::i32)
800 InstRate = getFullRateInstrCost();
801
802 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
803 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
804 NElts = 1;
805 break;
806 }
807 case Intrinsic::abs:
808 // Expansion takes 2 instructions for VALU
809 if (SLT == MVT::i16 || SLT == MVT::i32)
810 InstRate = 2 * getFullRateInstrCost();
811 break;
812 default:
813 break;
814 }
815
816 return LT.first * NElts * InstRate;
817}
818
// GCNTTIImpl::getCFInstrCost — control-flow cost model: unconditional branch
// 1/4, conditional branch 5/7 (the cheaper figure applies when SCost is set),
// switch priced per case, return 1/10; everything else delegates to BaseT.
// NOTE(review): dropped by the extraction: the signature start (upstream
// 819-820) and line 825 (the right-hand side of the `SCost =` initializer,
// presumably a CostKind comparison).
821 const Instruction *I) const {
822 assert((I == nullptr || I->getOpcode() == Opcode) &&
823 "Opcode should reflect passed instruction.");
824 const bool SCost =
826 const int CBrCost = SCost ? 5 : 7;
827 switch (Opcode) {
828 case Instruction::Br: {
829 // Branch instruction takes about 4 slots on gfx900.
830 const auto *BI = dyn_cast_or_null<BranchInst>(I);
831 if (BI && BI->isUnconditional())
832 return SCost ? 1 : 4;
833 // Suppose conditional branch takes additional 3 exec manipulations
834 // instructions in average.
835 return CBrCost;
836 }
837 case Instruction::Switch: {
838 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
839 // Each case (including default) takes 1 cmp + 1 cbr instructions in
840 // average.
841 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
842 }
843 case Instruction::Ret:
844 return SCost ? 1 : 10;
845 }
846 return BaseT::getCFInstrCost(Opcode, CostKind, I);
847}
848
// GCNTTIImpl::getArithmeticReductionCost — only specializes 16-bit scalar
// element types on subtargets with VOP3P (packed math); everything else falls
// back to the base implementation.
// NOTE(review): dropped by the extraction: the signature start (upstream
// 849-850) and lines 852-853 (the guard condition in front of the first
// BaseT delegation).
851 std::optional<FastMathFlags> FMF,
854 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
855
856 EVT OrigTy = TLI->getValueType(DL, Ty);
857
858 // Computes cost on targets that have packed math instructions(which support
859 // 16-bit types only).
860 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
861 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
862
863 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
864 return LT.first * getFullRateInstrCost();
866
// GCNTTIImpl::getMinMaxReductionCost — same shape as the arithmetic-reduction
// cost: only 16-bit scalar types with VOP3P get a half-rate specialization.
// NOTE(review): the signature start (upstream 867-868) and line 870 were
// dropped by the extraction.
869 FastMathFlags FMF,
871 EVT OrigTy = TLI->getValueType(DL, Ty);
872
873 // Computes cost on targets that have packed math instructions(which support
874 // 16-bit types only).
875 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
876 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
877
878 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
879 return LT.first * getHalfRateInstrCost(CostKind);
880}
881
// GCNTTIImpl::getVectorInstrCost — extract/insert of >=32-bit elements are
// free (subregister reads/writes), the first 16-bit lane is free with 16-bit
// instructions, dynamic indexing (Index == ~0u) costs 2; the rest delegates.
// NOTE(review): the signature start (upstream 882-883) was dropped by the
// extraction.
884 unsigned Index, const Value *Op0,
885 const Value *Op1) const {
886 switch (Opcode) {
887 case Instruction::ExtractElement:
888 case Instruction::InsertElement: {
889 unsigned EltSize
890 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
891 if (EltSize < 32) {
892 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
893 return 0;
894 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
895 Op1);
896 }
897
898 // Extracts are just reads of a subregister, so are free. Inserts are
899 // considered free because we don't want to have any cost for scalarizing
900 // operations, and we don't have to copy into a different register class.
901
902 // Dynamic indexing isn't free and is best avoided.
903 return Index == ~0u ? 2 : 0;
904 }
905 default:
906 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
907 }
908}
909
910/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
911/// this is analyzing the collective result of all output registers. Otherwise,
912/// this is only querying a specific result index if this returns multiple
913/// registers in a struct.
// NOTE(review): dropped by the extraction: the signature start (upstream 914)
// and lines 936/938 (the declaration of the register class RC obtained from
// getRegForInlineAsmConstraint). An output is divergent unless it maps to an
// SGPR class.
915 const CallInst *CI, ArrayRef<unsigned> Indices) const {
916 // TODO: Handle complex extract indices
917 if (Indices.size() > 1)
918 return true;
919
920 const DataLayout &DL = CI->getDataLayout();
921 const SIRegisterInfo *TRI = ST->getRegisterInfo();
922 TargetLowering::AsmOperandInfoVector TargetConstraints =
923 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
924
925 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
926
927 int OutputIdx = 0;
928 for (auto &TC : TargetConstraints) {
929 if (TC.Type != InlineAsm::isOutput)
930 continue;
931
932 // Skip outputs we don't care about.
933 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
934 continue;
935
937
939 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
940
941 // For AGPR constraints null is returned on subtargets without AGPRs, so
942 // assume divergent for null.
943 if (!RC || !TRI->isSGPRClass(RC))
944 return true;
945 }
946
947 return false;
948}
949
// GCNTTIImpl::isReadRegisterSourceOfDivergence — a read_register result is
// divergent for i1 (VCC-like) results and for registers named 'v*'/'a*'
// (VGPR/AGPR), except names starting with "vcc" (or an empty name), which are
// treated as uniform.
// NOTE(review): the signature start (upstream 950) and line 954 (the
// `StringRef RegName =` declaration start) were dropped by the extraction.
951 const IntrinsicInst *ReadReg) const {
952 Metadata *MD =
953 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
955 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
956
957 // Special case registers that look like VCC.
958 MVT VT = MVT::getVT(ReadReg->getType());
959 if (VT == MVT::i1)
960 return true;
961
962 // Special case scalar registers that start with 'v'.
963 if (RegName.starts_with("vcc") || RegName.empty())
964 return false;
965
966 // VGPR or AGPR is divergent. There aren't any specially named vector
967 // registers.
968 return RegName[0] == 'v' || RegName[0] == 'a';
969}
970
971/// \returns true if the result of the value could potentially be
972/// different across workitems in a wavefront.
// NOTE(review): dropped by the extraction: the signature (upstream 973), line
// 975 (the argument check's return expression), line 1005 (the condition in
// the addrspacecast_nonnull case, presumably a globally-addressable-scratch
// query), line 1017 (the default case's return, presumably
// AMDGPU::isIntrinsicSourceOfDivergence), line 1024 (the inline-asm
// delegation to isInlineAsmSourceOfDivergence) and line 1038 (the condition
// ending the AddrSpaceCastInst check). Restore from upstream before compiling.
974 if (const Argument *A = dyn_cast<Argument>(V))
976
977 // Loads from the private and flat address spaces are divergent, because
978 // threads can execute the load instruction with the same inputs and get
979 // different results.
980 //
981 // All other loads are not divergent, because if threads issue loads with the
982 // same arguments, they will always get the same result.
983 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
984 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
985 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
986
987 // Atomics are divergent because they are executed sequentially: when an
988 // atomic operation refers to the same address in each thread, then each
989 // thread after the first sees the value written by the previous thread as
990 // original value.
991 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(V))
992 return true;
993
994 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
995 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
996 switch (IID) {
997 case Intrinsic::read_register:
998 return isReadRegisterSourceOfDivergence(Intrinsic);
999 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1000 unsigned SrcAS =
1001 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1002 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1003 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1004 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1006 }
1007 case Intrinsic::amdgcn_workitem_id_y:
1008 case Intrinsic::amdgcn_workitem_id_z: {
1009 const Function *F = Intrinsic->getFunction();
1010 bool HasUniformYZ =
1011 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true);
1012 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1013 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1014 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1015 }
1016 default:
1018 }
1019 }
1020
1021 // Assume all function calls are a source of divergence.
1022 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1023 if (CI->isInlineAsm())
1025 return true;
1026 }
1027
1028 // Assume all function calls are a source of divergence.
1029 if (isa<InvokeInst>(V))
1030 return true;
1031
1032 // If the target supports globally addressable scratch, the mapping from
1033 // scratch memory to the flat aperture changes therefore an address space cast
1034 // is no longer uniform.
1035 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1036 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1037 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1039 }
1040
1041 return false;
1042}
1043
1045 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1046 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1047
1048 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1049 if (CI->isInlineAsm())
1051 return false;
1052 }
1053
1054 // In most cases TID / wavefrontsize is uniform.
1055 //
1056 // However, if a kernel has uneven dimesions we can have a value of
1057 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1058 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1059 // packed into a same wave which gives 1 and 0 after the division by 64
1060 // respectively.
1061 //
1062 // The X dimension doesn't reset within a wave if either both the Y
1063 // and Z dimensions are of length 1, or if the X dimension's required
1064 // size is a power of 2. Note, however, if the X dimension's maximum
1065 // size is a power of 2 < the wavefront size, division by the wavefront
1066 // size is guaranteed to yield 0, so this is also a no-reset case.
1067 bool XDimDoesntResetWithinWaves = false;
1068 if (auto *I = dyn_cast<Instruction>(V)) {
1069 const Function *F = I->getFunction();
1070 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1071 }
1072 using namespace llvm::PatternMatch;
1073 uint64_t C;
1074 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1075 m_ConstantInt(C))) ||
1076 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1077 m_ConstantInt(C)))) {
1078 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1079 }
1080
1081 Value *Mask;
1082 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1083 m_Value(Mask)))) {
1084 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1085 ST->getWavefrontSizeLog2() &&
1086 XDimDoesntResetWithinWaves;
1087 }
1088
1089 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1090 if (!ExtValue)
1091 return false;
1092
1093 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1094 if (!CI)
1095 return false;
1096
1097 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1098 switch (Intrinsic->getIntrinsicID()) {
1099 default:
1100 return false;
1101 case Intrinsic::amdgcn_if:
1102 case Intrinsic::amdgcn_else: {
1103 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1104 return Indices.size() == 1 && Indices[0] == 1;
1105 }
1106 }
1107 }
1108
1109 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1110 // divergent for the overall struct return. We need to override it in the
1111 // case we're extracting an SGPR component here.
1112 if (CI->isInlineAsm())
1113 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1114
1115 return false;
1116}
1117
1119 Intrinsic::ID IID) const {
1120 switch (IID) {
1121 case Intrinsic::amdgcn_is_shared:
1122 case Intrinsic::amdgcn_is_private:
1123 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1124 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1125 case Intrinsic::amdgcn_load_to_lds:
1126 case Intrinsic::amdgcn_make_buffer_rsrc:
1127 OpIndexes.push_back(0);
1128 return true;
1129 default:
1130 return false;
1131 }
1132}
1133
1135 Value *OldV,
1136 Value *NewV) const {
1137 auto IntrID = II->getIntrinsicID();
1138 switch (IntrID) {
1139 case Intrinsic::amdgcn_is_shared:
1140 case Intrinsic::amdgcn_is_private: {
1141 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1143 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1144 LLVMContext &Ctx = NewV->getType()->getContext();
1145 ConstantInt *NewVal = (TrueAS == NewAS) ?
1147 return NewVal;
1148 }
1149 case Intrinsic::ptrmask: {
1150 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1151 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1152 Value *MaskOp = II->getArgOperand(1);
1153 Type *MaskTy = MaskOp->getType();
1154
1155 bool DoTruncate = false;
1156
1157 const GCNTargetMachine &TM =
1158 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1159 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1160 // All valid 64-bit to 32-bit casts work by chopping off the high
1161 // bits. Any masking only clearing the low bits will also apply in the new
1162 // address space.
1163 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1164 DL.getPointerSizeInBits(NewAS) != 32)
1165 return nullptr;
1166
1167 // TODO: Do we need to thread more context in here?
1168 KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II);
1169 if (Known.countMinLeadingOnes() < 32)
1170 return nullptr;
1171
1172 DoTruncate = true;
1173 }
1174
1175 IRBuilder<> B(II);
1176 if (DoTruncate) {
1177 MaskTy = B.getInt32Ty();
1178 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1179 }
1180
1181 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1182 {NewV, MaskOp});
1183 }
1184 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1185 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1186 Type *DestTy = II->getType();
1187 Type *SrcTy = NewV->getType();
1188 unsigned NewAS = SrcTy->getPointerAddressSpace();
1190 return nullptr;
1191 Module *M = II->getModule();
1193 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1194 II->setArgOperand(0, NewV);
1195 II->setCalledFunction(NewDecl);
1196 return II;
1197 }
1198 case Intrinsic::amdgcn_load_to_lds: {
1199 Type *SrcTy = NewV->getType();
1200 Module *M = II->getModule();
1201 Function *NewDecl =
1202 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1203 II->setArgOperand(0, NewV);
1204 II->setCalledFunction(NewDecl);
1205 return II;
1206 }
1207 case Intrinsic::amdgcn_make_buffer_rsrc: {
1208 Type *SrcTy = NewV->getType();
1209 Type *DstTy = II->getType();
1210 Module *M = II->getModule();
1212 M, II->getIntrinsicID(), {DstTy, SrcTy});
1213 II->setArgOperand(0, NewV);
1214 II->setCalledFunction(NewDecl);
1215 return II;
1216 }
1217 default:
1218 return nullptr;
1219 }
1220}
1221
1223 VectorType *DstTy, VectorType *SrcTy,
1224 ArrayRef<int> Mask,
1226 int Index, VectorType *SubTp,
1228 const Instruction *CxtI) const {
1229 if (!isa<FixedVectorType>(SrcTy))
1230 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1231 SubTp);
1232
1233 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1234
1235 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1237 (ScalarSize == 16 || ScalarSize == 8)) {
1238 // Larger vector widths may require additional instructions, but are
1239 // typically cheaper than scalarized versions.
1240 unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1241 unsigned RequestedElts =
1242 count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1243 unsigned EltsPerReg = 32 / ScalarSize;
1244 if (RequestedElts == 0)
1245 return 0;
1246 switch (Kind) {
1247 case TTI::SK_Broadcast:
1248 case TTI::SK_Reverse:
1250 // With op_sel VOP3P instructions freely can access the low half or high
1251 // half of a register, so any swizzle of two elements is free.
1252 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
1253 return 0;
1254 unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1255 // SK_Broadcast just reuses the same mask
1256 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1257 return NumPerms + NumPermMasks;
1258 }
1261 // Even aligned accesses are free
1262 if (!(Index % 2))
1263 return 0;
1264 // Insert/extract subvectors only require shifts / extract code to get the
1265 // relevant bits
1266 return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1267 }
1269 case TTI::SK_Splice:
1270 case TTI::SK_Select: {
1271 unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1272 // SK_Select just reuses the same mask
1273 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1274 return NumPerms + NumPermMasks;
1275 }
1276
1277 default:
1278 break;
1279 }
1280 }
1281
1282 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1283 SubTp);
1284}
1285
1286/// Whether it is profitable to sink the operands of an
1287/// Instruction I to the basic block of I.
1288/// This helps using several modifiers (like abs and neg) more often.
1290 SmallVectorImpl<Use *> &Ops) const {
1291 using namespace PatternMatch;
1292
1293 for (auto &Op : I->operands()) {
1294 // Ensure we are not already sinking this operand.
1295 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1296 continue;
1297
1298 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
1299 Ops.push_back(&Op);
1300 }
1301
1302 return !Ops.empty();
1303}
1304
1306 const Function *Callee) const {
1307 const TargetMachine &TM = getTLI()->getTargetMachine();
1308 const GCNSubtarget *CallerST
1309 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1310 const GCNSubtarget *CalleeST
1311 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1312
1313 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1314 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1315
1316 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1317 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1318 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1319 return false;
1320
1321 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1322 // no way to support merge for backend defined attributes.
1323 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1324 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1325 if (!CallerMode.isInlineCompatible(CalleeMode))
1326 return false;
1327
1328 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1329 Callee->hasFnAttribute(Attribute::InlineHint))
1330 return true;
1331
1332 // Hack to make compile times reasonable.
1333 if (InlineMaxBB) {
1334 // Single BB does not increase total BB amount.
1335 if (Callee->size() == 1)
1336 return true;
1337 size_t BBSize = Caller->size() + Callee->size() - 1;
1338 return BBSize <= InlineMaxBB;
1339 }
1340
1341 return true;
1342}
1343
1345 const SITargetLowering *TLI,
1346 const GCNTTIImpl *TTIImpl) {
1347 const int NrOfSGPRUntilSpill = 26;
1348 const int NrOfVGPRUntilSpill = 32;
1349
1350 const DataLayout &DL = TTIImpl->getDataLayout();
1351
1352 unsigned adjustThreshold = 0;
1353 int SGPRsInUse = 0;
1354 int VGPRsInUse = 0;
1355 for (const Use &A : CB->args()) {
1356 SmallVector<EVT, 4> ValueVTs;
1357 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1358 for (auto ArgVT : ValueVTs) {
1359 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1360 CB->getContext(), CB->getCallingConv(), ArgVT);
1362 SGPRsInUse += CCRegNum;
1363 else
1364 VGPRsInUse += CCRegNum;
1365 }
1366 }
1367
1368 // The cost of passing function arguments through the stack:
1369 // 1 instruction to put a function argument on the stack in the caller.
1370 // 1 instruction to take a function argument from the stack in callee.
1371 // 1 instruction is explicitly take care of data dependencies in callee
1372 // function.
1373 InstructionCost ArgStackCost(1);
1374 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1375 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1377 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1378 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1380
1381 // The penalty cost is computed relative to the cost of instructions and does
1382 // not model any storage costs.
1383 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1384 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1385 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1386 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1387 return adjustThreshold;
1388}
1389
1390static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1391 const DataLayout &DL) {
1392 // If we have a pointer to a private array passed into a function
1393 // it will not be optimized out, leaving scratch usage.
1394 // This function calculates the total size in bytes of the memory that would
1395 // end in scratch if the call was not inlined.
1396 unsigned AllocaSize = 0;
1398 for (Value *PtrArg : CB->args()) {
1399 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1400 if (!Ty)
1401 continue;
1402
1403 unsigned AddrSpace = Ty->getAddressSpace();
1404 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1405 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1406 continue;
1407
1408 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1409 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1410 continue;
1411
1412 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1413 }
1414 return AllocaSize;
1415}
1416
1420}
1421
1423 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1424
1425 // Private object passed as arguments may end up in scratch usage if the call
1426 // is not inlined. Increase the inline threshold to promote inlining.
1427 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1428 if (AllocaSize > 0)
1429 Threshold += ArgAllocaCost;
1430 return Threshold;
1431}
1432
1434 const AllocaInst *AI) const {
1435
1436 // Below the cutoff, assume that the private memory objects would be
1437 // optimized
1438 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1439 if (AllocaSize <= ArgAllocaCutoff)
1440 return 0;
1441
1442 // Above the cutoff, we give a cost to each private memory object
1443 // depending its size. If the array can be optimized by SROA this cost is not
1444 // added to the total-cost in the inliner cost analysis.
1445 //
1446 // We choose the total cost of the alloca such that their sum cancels the
1447 // bonus given in the threshold (ArgAllocaCost).
1448 //
1449 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1450 //
1451 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1452 // the single-bb bonus and the vector-bonus.
1453 //
1454 // We compensate the first two multipliers, by repeating logic from the
1455 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1456 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1457 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1458
1459 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1460 return BB.getTerminator()->getNumSuccessors() > 1;
1461 });
1462 if (SingleBB) {
1463 Threshold += Threshold / 2;
1464 }
1465
1466 auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
1467
1468 // Attribute the bonus proportionally to the alloca size
1469 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1470
1471 return AllocaThresholdBonus;
1472}
1473
1476 OptimizationRemarkEmitter *ORE) const {
1477 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1478}
1479
1481 TTI::PeelingPreferences &PP) const {
1482 CommonTTI.getPeelingPreferences(L, SE, PP);
1483}
1484
1485int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1486 return ST->hasFullRate64Ops()
1487 ? getFullRateInstrCost()
1488 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1489 : getQuarterRateInstrCost(CostKind);
1490}
1491
1492std::pair<InstructionCost, MVT>
1493GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1494 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1495 auto Size = DL.getTypeSizeInBits(Ty);
1496 // Maximum load or store can handle 8 dwords for scalar and 4 for
1497 // vector ALU. Let's assume anything above 8 dwords is expensive
1498 // even if legal.
1499 if (Size <= 256)
1500 return Cost;
1501
1502 Cost.first += (Size + 255) / 256;
1503 return Cost;
1504}
1505
1507 return ST->hasPrefetch() ? 128 : 0;
1508}
1509
1512}
1513
1515 const Function &F,
1516 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1517 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1518 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1519 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1520 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1521 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1523 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1524 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1525 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1526 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1527 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1528}
1529
1532 if (!ST->hasIEEEMode()) // Only mode on gfx12
1533 return KnownIEEEMode::On;
1534
1535 const Function *F = I.getFunction();
1536 if (!F)
1538
1539 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1540 if (IEEEAttr.isValid())
1542
1543 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1545}
1546
1548 Align Alignment,
1549 unsigned AddressSpace,
1551 TTI::OperandValueInfo OpInfo,
1552 const Instruction *I) const {
1553 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1554 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1555 VecTy->getElementType()->isIntegerTy(8)) {
1556 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1558 }
1559 }
1560 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1561 OpInfo, I);
1562}
1563
1565 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1566 if (VecTy->getElementType()->isIntegerTy(8)) {
1567 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1568 return divideCeil(ElementCount - 1, 4);
1569 }
1570 }
1571 return BaseT::getNumberOfParts(Tp);
1572}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering memcpy as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
return RetTy
uint64_t Size
Hexagon Common GEP
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
#define LLVM_DEBUG(...)
Definition: Debug.h:119
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasMadMacF32Insts() const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
bool hasVOP3PInsts() const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
an instruction to allocate memory on the stack
Definition: Instructions.h:64
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:121
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:386
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:223
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition: BasicTTIImpl.h:774
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:997
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1415
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
CallingConv::ID getCallingConv() const
Definition: InstrTypes.h:1406
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1283
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
Definition: InstrTypes.h:1323
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:868
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:875
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:169
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
The size in bits of the pointer representation in a given address space.
Definition: DataLayout.h:390
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:323
This instruction extracts a struct member or array element value from an aggregate value.
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
bool hasPrefetch() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:522
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:320
bool hasGloballyAddressableScratch() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:420
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:638
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:383
bool hasIEEEMode() const
Generation getGeneration() const
Definition: GCNSubtarget.h:356
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume "amdgpu-ieee"="false".
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool isAlwaysUniform(const Value *V) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
bool isSourceOfDivergence(const Value *V) const override
int getInliningLastCallToStaticBonus() const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:949
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full range of operator support required for arithmetic and comparisons.
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:86
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:56
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
Metadata node.
Definition: Metadata.h:1077
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
Root of the metadata hierarchy.
Definition: Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
The optimization diagnostic interface.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
std::vector< AsmOperandInfo > AsmOperandInfoVector
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their prefixes, and also tie in the associated operand values.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
TargetOptions Options
virtual int getInliningLastCallToStaticBonus() const
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:311
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1098
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
Type * getElementType() const
Definition: DerivedTypes.h:463
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
LLVM_ABI int getInstrCost()
Definition: InlineCost.cpp:206
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:477
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1079
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ FAdd
Sum of floats.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:119
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
InstructionCost Cost
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:244
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll profitability.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...