AMDGPUInstCombineIntrinsic.cpp
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements the AMDGPU-specific hooks of InstCombine. It uses the
11// target's detailed information to fold and simplify calls to AMDGPU
12// intrinsics, while letting the target-independent and default combines
13// handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
24#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
31namespace {
32
33struct AMDGPUImageDMaskIntrinsic {
34 unsigned Intr;
35};
36
37#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
38#include "InstCombineTables.inc"
39
40} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(Src1, Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(Src0, Src2);
59
60 return maxnum(Src0, Src1);
61}
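// Illustrative example (not part of the upstream source): for inputs
// (1.0, 5.0, 3.0) the helper computes Max3 = 5.0, which compares equal to
// Src1, so it returns maxnum(Src0, Src2) = 3.0, the median of the three.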
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
78 bool LosesInfo = true;
79 FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
80 &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
94 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
98 return true;
99 }
100
101 return false;
102}
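// Illustrative examples (not part of the upstream source):
//   fpext half %h to float                      -> convertible (true)
//   float 2.5 (exactly representable in half)   -> convertible (true)
//   zext i16 %v to i32                          -> convertible (true)
//   i32 70000 (more than 16 active bits)        -> not convertible (false)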
103
104// Convert a value to 16-bit.
106 Type *VTy = V.getType();
107 if (isa<FPExtInst, SExtInst, ZExtInst>(&V))
108 return cast<Instruction>(&V)->getOperand(0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
117/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
118/// modified arguments (based on OldIntr) and replaces InstToReplace with
119/// this newly created intrinsic call.
120static std::optional<Instruction *> modifyIntrinsicCall(
121 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
122 InstCombiner &IC,
123 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
124 Func) {
125  SmallVector<Type *, 4> ArgTys;
126  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
127    return std::nullopt;
128
129 SmallVector<Value *, 8> Args(OldIntr.args());
130
131 // Modify arguments and types
132 Func(Args, ArgTys);
133
134 CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args);
135 NewCall->takeName(&OldIntr);
136 NewCall->copyMetadata(OldIntr);
137 if (isa<FPMathOperator>(NewCall))
138 NewCall->copyFastMathFlags(&OldIntr);
139
140 // Erase and replace uses
141 if (!InstToReplace.getType()->isVoidTy())
142 IC.replaceInstUsesWith(InstToReplace, NewCall);
143
144 bool RemoveOldIntr = &OldIntr != &InstToReplace;
145
146 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
147 if (RemoveOldIntr)
148 IC.eraseInstFromFunction(OldIntr);
149
150 return RetValue;
151}
152
153static std::optional<Instruction *>
154simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
155                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
156                             IntrinsicInst &II, InstCombiner &IC) {
157 // Optimize _L to _LZ when _L is zero
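  // For example (illustrative, operands abbreviated):
  //   call @llvm.amdgcn.image.sample.l.2d(..., float 0.0 /*lod*/, ...)
  // is rewritten to the lod-zero variant
  //   call @llvm.amdgcn.image.sample.lz.2d(...)
  // with the lod argument dropped.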
158 if (const auto *LZMappingInfo =
159          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
160    if (auto *ConstantLod =
161 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
162 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
164            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
165                                                     ImageDimIntr->Dim);
166 return modifyIntrinsicCall(
167 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
168 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169 });
170 }
171 }
172 }
173
174 // Optimize _mip away, when 'lod' is zero
175 if (const auto *MIPMappingInfo =
176          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
177    if (auto *ConstantMip =
178 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
179 if (ConstantMip->isZero()) {
180 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
182 ImageDimIntr->Dim);
183 return modifyIntrinsicCall(
184 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
185 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186 });
187 }
188 }
189 }
190
191 // Optimize _bias away when 'bias' is zero
192 if (const auto *BiasMappingInfo =
193          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
194    if (auto *ConstantBias =
195 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
196 if (ConstantBias->isZero()) {
197 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
199 ImageDimIntr->Dim);
200 return modifyIntrinsicCall(
201 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
202 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204 });
205 }
206 }
207 }
208
209 // Optimize _offset away when 'offset' is zero
210 if (const auto *OffsetMappingInfo =
211          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
212    if (auto *ConstantOffset =
213 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
214 if (ConstantOffset->isZero()) {
215 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
216            AMDGPU::getImageDimIntrinsicByBaseOpcode(
217                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
218 return modifyIntrinsicCall(
219 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
220 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221 });
222 }
223 }
224 }
225
226 // Try to use D16
227 if (ST->hasD16Images()) {
228
229    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
230        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
231
232 if (BaseOpcode->HasD16) {
233
234 // If the only use of image intrinsic is a fptrunc (with conversion to
235 // half) then both fptrunc and image intrinsic will be replaced with image
236 // intrinsic with D16 flag.
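      // For example (illustrative): if the only user is
      //   %h = fptrunc <4 x float> %sample to <4 x half>
      // the sample is re-created with a <4 x half> return type (D16) and the
      // fptrunc is replaced by it.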
237 if (II.hasOneUse()) {
238 Instruction *User = II.user_back();
239
240        if (User->getOpcode() == Instruction::FPTrunc &&
241            User->getType()->getScalarType()->isHalfTy()) {
242
243 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
244 [&](auto &Args, auto &ArgTys) {
245 // Change return type of image intrinsic.
246 // Set it to return type of fptrunc.
247 ArgTys[0] = User->getType();
248 });
249 }
250 }
251
252 // Only perform D16 folding if every user of the image sample is
253 // an ExtractElementInst immediately followed by an FPTrunc to half.
254      SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
255          ExtractTruncPairs;
256 bool AllHalfExtracts = true;
257
258 for (User *U : II.users()) {
259 auto *Ext = dyn_cast<ExtractElementInst>(U);
260 if (!Ext || !Ext->hasOneUse()) {
261 AllHalfExtracts = false;
262 break;
263 }
264
265 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
266 if (!Tr || !Tr->getType()->isHalfTy()) {
267 AllHalfExtracts = false;
268 break;
269 }
270
271 ExtractTruncPairs.emplace_back(Ext, Tr);
272 }
273
274 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
275 auto *VecTy = cast<VectorType>(II.getType());
276 Type *HalfVecTy =
277 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
278
279 // Obtain the original image sample intrinsic's signature
280 // and replace its return type with the half-vector for D16 folding
281      SmallVector<Type *, 8> SigTys;
282      Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys);
283 SigTys[0] = HalfVecTy;
284
285 Module *M = II.getModule();
286 Function *HalfDecl =
287 Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
288
289 II.mutateType(HalfVecTy);
290 II.setCalledFunction(HalfDecl);
291
292 IRBuilder<> Builder(II.getContext());
293 for (auto &[Ext, Tr] : ExtractTruncPairs) {
294 Value *Idx = Ext->getIndexOperand();
295
296 Builder.SetInsertPoint(Tr);
297
298 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
299 HalfExtract->takeName(Tr);
300
301 Tr->replaceAllUsesWith(HalfExtract);
302 }
303
304 for (auto &[Ext, Tr] : ExtractTruncPairs) {
305 IC.eraseInstFromFunction(*Tr);
306 IC.eraseInstFromFunction(*Ext);
307 }
308
309 return &II;
310 }
311 }
312 }
313
314 // Try to use A16 or G16
315 if (!ST->hasA16() && !ST->hasG16())
316 return std::nullopt;
317
318 // Address is interpreted as float if the instruction has a sampler or as
319 // unsigned int if there is no sampler.
320 bool HasSampler =
321      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
322  bool FloatCoord = false;
323 // true means derivatives can be converted to 16 bit, coordinates not
324 bool OnlyDerivatives = false;
325
326 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
327 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
328 Value *Coord = II.getOperand(OperandIndex);
329 // If the values are not derived from 16-bit values, we cannot optimize.
330 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
331 if (OperandIndex < ImageDimIntr->CoordStart ||
332 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
333 return std::nullopt;
334 }
335 // All gradients can be converted, so convert only them
336 OnlyDerivatives = true;
337 break;
338 }
339
340 assert(OperandIndex == ImageDimIntr->GradientStart ||
341 FloatCoord == Coord->getType()->isFloatingPointTy());
342 FloatCoord = Coord->getType()->isFloatingPointTy();
343 }
344
345 if (!OnlyDerivatives && !ST->hasA16())
346 OnlyDerivatives = true; // Only supports G16
347
348 // Check if there is a bias parameter and if it can be converted to f16
349 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
350 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
351 assert(HasSampler &&
352 "Only image instructions with a sampler can have a bias");
353 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
354 OnlyDerivatives = true;
355 }
356
357 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
358 ImageDimIntr->CoordStart))
359 return std::nullopt;
360
361 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
362 : Type::getInt16Ty(II.getContext());
363
364 return modifyIntrinsicCall(
365 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
366 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
367 if (!OnlyDerivatives) {
368 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
369
370 // Change the bias type
371 if (ImageDimIntr->NumBiasArgs != 0)
372 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
373 }
374
375 unsigned EndIndex =
376 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
377 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
378 OperandIndex < EndIndex; OperandIndex++) {
379 Args[OperandIndex] =
380 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
381 }
382
383 // Convert the bias
384 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
385 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
386 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
387 }
388 });
389}
390
391bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
392                                           const Value *Op0, const Value *Op1,
393 InstCombiner &IC) const {
394 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
395 // infinity, gives +0.0. If we can prove we don't have one of the special
396 // cases then we can use a normal multiply instead.
397 // TODO: Create and use isKnownFiniteNonZero instead of just matching
398 // constants here.
399  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
400      match(Op1, PatternMatch::m_FiniteNonZero())) {
401    // One operand is not zero or infinity or NaN.
402 return true;
403 }
404
405  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
406  if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
407 // Neither operand is infinity or NaN.
408 return true;
409 }
410 return false;
411}
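// Illustrative example (not part of the upstream source): fmul_legacy(+0.0, NaN)
// yields +0.0, so the call can only be turned into a plain fmul once one operand
// is a finite non-zero constant or both operands are known never inf/NaN.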
412
413/// Match an fpext from half to float, or a constant we can convert.
414static Value *matchFPExtFromF16(Value *Arg) {
415  Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(Arg, m_ConstantFP(CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
423    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
424    if (!LosesInfo)
425 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
432static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
433                                       Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(UseV, i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(i);
451 }
452
453 return DemandedElts;
454}
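// Illustrative example (not part of the upstream source): for a store of
// <4 x float> <%x, %y, 0.0, 0.0> this returns a demanded-elements mask of
// 0b0011, so the trailing zero components can be dropped.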
455
456// Trim elements of the end of the vector \p V, if they are
457// equal to the first element of the vector.
458static APInt defaultComponentBroadcast(Value *V) {
459  auto *VTy = cast<FixedVectorType>(V->getType());
460 unsigned VWidth = VTy->getNumElements();
461 APInt DemandedElts = APInt::getAllOnes(VWidth);
462 Value *FirstComponent = findScalarElement(V, 0);
463
464 SmallVector<int> ShuffleMask;
465 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
466 SVI->getShuffleMask(ShuffleMask);
467
468 for (int I = VWidth - 1; I > 0; --I) {
469 if (ShuffleMask.empty()) {
470 auto *Elt = findScalarElement(V, I);
471 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
472 break;
473 } else {
474 // Detect identical elements in the shufflevector result, even though
475 // findScalarElement cannot tell us what that element is.
476 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
477 break;
478 }
479 DemandedElts.clearBit(I);
480 }
481
482 return DemandedElts;
483}
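// Illustrative example (not part of the upstream source): for a store of
// <4 x float> <%x, %x, %x, %x> (a broadcast of the first element) this
// returns a demanded-elements mask of 0b0001.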
484
485static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
486                                                     IntrinsicInst &II,
487                                                     APInt DemandedElts,
488 int DMaskIdx = -1,
489 bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(V))
504    return AMDGPU::isArgPassedInSGPR(A);
505  if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
511 }
512 return false;
513}
514
515/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
516///
517/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
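/// For example (illustrative): on a wave32 target, llvm.amdgcn.readlane(%v, 34)
/// is folded to use lane 34 & 31 = 2, matching the hardware's masking of the
/// lane index.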
518bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
519                                             IntrinsicInst &II,
520                                             unsigned LaneArgIdx) const {
521 unsigned MaskBits = ST->getWavefrontSizeLog2();
522 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
523
524 KnownBits Known(32);
525 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
526 return true;
527
528 if (!Known.isConstant())
529 return false;
530
531 // Out of bounds indexes may appear in wave64 code compiled for wave32.
532 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
533 // manually fix it up.
534
535 Value *LaneArg = II.getArgOperand(LaneArgIdx);
536 Constant *MaskedConst =
537 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
538 if (MaskedConst != LaneArg) {
539 II.getOperandUse(LaneArgIdx).set(MaskedConst);
540 return true;
541 }
542
543 return false;
544}
545
546static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
547                             Function &NewCallee, ArrayRef<Value *> Ops) {
548  SmallVector<OperandBundleDef, 2> OpBundles;
549  Old.getOperandBundlesAsDefs(OpBundles);
550
551 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
552 NewCall->takeName(&Old);
553 return NewCall;
554}
555
556Instruction *
557GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
558                                             IntrinsicInst &II) const {
559 const auto IID = II.getIntrinsicID();
560 assert(IID == Intrinsic::amdgcn_readlane ||
561 IID == Intrinsic::amdgcn_readfirstlane ||
562 IID == Intrinsic::amdgcn_permlane64);
563
564 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
565
566 // Only do this if both instructions are in the same block
567 // (so the exec mask won't change) and the readlane is the only user of its
568 // operand.
569 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
570 return nullptr;
571
572 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
573
574 // If this is a readlane, check that the second operand is a constant, or is
575 // defined before OpInst so we know it's safe to move this intrinsic higher.
576 Value *LaneID = nullptr;
577 if (IsReadLane) {
578 LaneID = II.getOperand(1);
579
580 // readlane take an extra operand for the lane ID, so we must check if that
581 // LaneID value can be used at the point where we want to move the
582 // intrinsic.
583 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
584 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
585 return nullptr;
586 }
587 }
588
589 // Hoist the intrinsic (II) through OpInst.
590 //
591 // (II (OpInst x)) -> (OpInst (II x))
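  //
  // For example (illustrative): readfirstlane(zext i16 %x to i32) becomes
  // zext (readfirstlane i16 %x) to i32, assuming i16 is a legal type here.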
592 const auto DoIt = [&](unsigned OpIdx,
593 Function *NewIntrinsic) -> Instruction * {
594    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
595    if (IsReadLane)
596 Ops.push_back(LaneID);
597
598 // Rewrite the intrinsic call.
599 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
600
601 // Rewrite OpInst so it takes the result of the intrinsic now.
602 Instruction &NewOp = *OpInst->clone();
603 NewOp.setOperand(OpIdx, NewII);
604 return &NewOp;
605 };
606
607 // TODO(?): Should we do more with permlane64?
608 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
609 return nullptr;
610
611 if (isa<UnaryOperator>(OpInst))
612 return DoIt(0, II.getCalledFunction());
613
614 if (isa<CastInst>(OpInst)) {
615 Value *Src = OpInst->getOperand(0);
616 Type *SrcTy = Src->getType();
617 if (!isTypeLegal(SrcTy))
618 return nullptr;
619
620 Function *Remangled =
621 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
622 return DoIt(0, Remangled);
623 }
624
625 // We can also hoist through binary operators if the other operand is uniform.
626 if (isa<BinaryOperator>(OpInst)) {
627 // FIXME: If we had access to UniformityInfo here we could just check
628 // if the operand is uniform.
629 if (isTriviallyUniform(OpInst->getOperandUse(0)))
630 return DoIt(1, II.getCalledFunction());
631 if (isTriviallyUniform(OpInst->getOperandUse(1)))
632 return DoIt(0, II.getCalledFunction());
633 }
634
635 return nullptr;
636}
637
638std::optional<Instruction *>
639GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
640  Intrinsic::ID IID = II.getIntrinsicID();
641 switch (IID) {
642 case Intrinsic::amdgcn_rcp: {
643 Value *Src = II.getArgOperand(0);
644 if (isa<PoisonValue>(Src))
645 return IC.replaceInstUsesWith(II, Src);
646
647 // TODO: Move to ConstantFolding/InstSimplify?
648 if (isa<UndefValue>(Src)) {
649 Type *Ty = II.getType();
650 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
651 return IC.replaceInstUsesWith(II, QNaN);
652 }
653
654 if (II.isStrictFP())
655 break;
656
657 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
658 const APFloat &ArgVal = C->getValueAPF();
659      APFloat Val(ArgVal.getSemantics(), 1);
660      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
661
662 // This is more precise than the instruction may give.
663 //
664 // TODO: The instruction always flushes denormal results (except for f16),
665 // should this also?
666 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
667 }
668
669 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
670 if (!FMF.allowContract())
671 break;
672 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
673 if (!SrcCI)
674 break;
675
676 auto IID = SrcCI->getIntrinsicID();
677 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
678 //
679 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
680 // relaxed.
681 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
682 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
683 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
684 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
685 break;
686
687 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
688 break;
689
690      Function *NewDecl = Intrinsic::getOrInsertDeclaration(
691          SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
692
693 InnerFMF |= FMF;
694 II.setFastMathFlags(InnerFMF);
695
696 II.setCalledFunction(NewDecl);
697 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
698 }
699
700 break;
701 }
702 case Intrinsic::amdgcn_sqrt:
703 case Intrinsic::amdgcn_rsq:
704 case Intrinsic::amdgcn_tanh: {
705 Value *Src = II.getArgOperand(0);
706 if (isa<PoisonValue>(Src))
707 return IC.replaceInstUsesWith(II, Src);
708
709 // TODO: Move to ConstantFolding/InstSimplify?
710 if (isa<UndefValue>(Src)) {
711 Type *Ty = II.getType();
712 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
713 return IC.replaceInstUsesWith(II, QNaN);
714 }
715
716 // f16 amdgcn.sqrt is identical to regular sqrt.
717 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
718      Function *NewDecl = Intrinsic::getOrInsertDeclaration(
719          II.getModule(), Intrinsic::sqrt, {II.getType()});
720 II.setCalledFunction(NewDecl);
721 return &II;
722 }
723
724 break;
725 }
726 case Intrinsic::amdgcn_log:
727 case Intrinsic::amdgcn_exp2: {
728 const bool IsLog = IID == Intrinsic::amdgcn_log;
729 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
730 Value *Src = II.getArgOperand(0);
731 Type *Ty = II.getType();
732
733 if (isa<PoisonValue>(Src))
734 return IC.replaceInstUsesWith(II, Src);
735
736 if (IC.getSimplifyQuery().isUndefValue(Src))
738
739 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
740 if (C->isInfinity()) {
741 // exp2(+inf) -> +inf
742 // log2(+inf) -> +inf
743 if (!C->isNegative())
744 return IC.replaceInstUsesWith(II, C);
745
746 // exp2(-inf) -> 0
747 if (IsExp && C->isNegative())
748          return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
749      }
750
751 if (II.isStrictFP())
752 break;
753
754 if (C->isNaN()) {
755 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
756 return IC.replaceInstUsesWith(II, Quieted);
757 }
758
759 // f32 instruction doesn't handle denormals, f16 does.
760 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
761 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
762 : ConstantFP::get(Ty, 1.0);
763 return IC.replaceInstUsesWith(II, FoldedValue);
764 }
765
766 if (IsLog && C->isNegative())
768
769 // TODO: Full constant folding matching hardware behavior.
770 }
771
772 break;
773 }
774 case Intrinsic::amdgcn_frexp_mant:
775 case Intrinsic::amdgcn_frexp_exp: {
776 Value *Src = II.getArgOperand(0);
777 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
778 int Exp;
779 APFloat Significand =
780 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
781
782 if (IID == Intrinsic::amdgcn_frexp_mant) {
783 return IC.replaceInstUsesWith(
784 II, ConstantFP::get(II.getContext(), Significand));
785 }
786
787 // Match instruction special case behavior.
788 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
789 Exp = 0;
790
791 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
792 }
793
794 if (isa<PoisonValue>(Src))
795 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
796
797 if (isa<UndefValue>(Src)) {
798 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
799 }
800
801 break;
802 }
803 case Intrinsic::amdgcn_class: {
804 Value *Src0 = II.getArgOperand(0);
805 Value *Src1 = II.getArgOperand(1);
806 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
807 if (CMask) {
808 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
809 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
810
811 // Clamp any excess bits, as they're illegal for the generic intrinsic.
812 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
813 CMask->getZExtValue() & fcAllFlags));
814 return &II;
815 }
816
817 // Propagate poison.
818 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
819 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
820
821 // llvm.amdgcn.class(_, undef) -> false
822 if (IC.getSimplifyQuery().isUndefValue(Src1))
823 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
824
825 // llvm.amdgcn.class(undef, mask) -> mask != 0
826 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
827 Value *CmpMask = IC.Builder.CreateICmpNE(
828 Src1, ConstantInt::getNullValue(Src1->getType()));
829 return IC.replaceInstUsesWith(II, CmpMask);
830 }
831 break;
832 }
833 case Intrinsic::amdgcn_cvt_pkrtz: {
834 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
835 Type *HalfTy = Type::getHalfTy(Arg->getContext());
836
837 if (isa<PoisonValue>(Arg))
838 return PoisonValue::get(HalfTy);
839 if (isa<UndefValue>(Arg))
840 return UndefValue::get(HalfTy);
841
842 ConstantFP *CFP = nullptr;
843 if (match(Arg, m_ConstantFP(CFP))) {
844 bool LosesInfo;
845 APFloat Val(CFP->getValueAPF());
846        Val.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
847        return ConstantFP::get(HalfTy, Val);
848 }
849
850 Value *Src = nullptr;
851 if (match(Arg, m_FPExt(m_Value(Src)))) {
852 if (Src->getType()->isHalfTy())
853 return Src;
854 }
855
856 return nullptr;
857 };
858
859 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
860 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
861 Value *V = PoisonValue::get(II.getType());
862 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
863 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
864 return IC.replaceInstUsesWith(II, V);
865 }
866 }
867
868 break;
869 }
870 case Intrinsic::amdgcn_cvt_pknorm_i16:
871 case Intrinsic::amdgcn_cvt_pknorm_u16:
872 case Intrinsic::amdgcn_cvt_pk_i16:
873 case Intrinsic::amdgcn_cvt_pk_u16: {
874 Value *Src0 = II.getArgOperand(0);
875 Value *Src1 = II.getArgOperand(1);
876
877 // TODO: Replace call with scalar operation if only one element is poison.
878 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
879 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
880
881 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
882 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
883 }
884
885 break;
886 }
887 case Intrinsic::amdgcn_cvt_off_f32_i4: {
888    Value *Arg = II.getArgOperand(0);
889 Type *Ty = II.getType();
890
891    if (isa<PoisonValue>(Arg))
892      return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
893
894    if (IC.getSimplifyQuery().isUndefValue(Arg))
895      return IC.replaceInstUsesWith(II, Constant::getNullValue(Ty));
896
897 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
898 if (!CArg)
899 break;
900
901 // Tabulated 0.0625 * (sext (CArg & 0xf)).
902 constexpr size_t ResValsSize = 16;
903 static constexpr float ResVals[ResValsSize] = {
904 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
905 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
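    // For example (illustrative): CArg = 9 selects ResVals[9] = -0.4375,
    // i.e. 0.0625 * sext(i4 0b1001) = 0.0625 * -7.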
906 Constant *Res =
907 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
908 return IC.replaceInstUsesWith(II, Res);
909 }
910 case Intrinsic::amdgcn_ubfe:
911 case Intrinsic::amdgcn_sbfe: {
912 // Decompose simple cases into standard shifts.
913 Value *Src = II.getArgOperand(0);
914 if (isa<UndefValue>(Src)) {
915 return IC.replaceInstUsesWith(II, Src);
916 }
917
918 unsigned Width;
919 Type *Ty = II.getType();
920 unsigned IntSize = Ty->getIntegerBitWidth();
921
922 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
923 if (CWidth) {
924 Width = CWidth->getZExtValue();
925 if ((Width & (IntSize - 1)) == 0) {
926        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
927      }
928
929 // Hardware ignores high bits, so remove those.
930 if (Width >= IntSize) {
931 return IC.replaceOperand(
932 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
933 }
934 }
935
936 unsigned Offset;
937 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
938 if (COffset) {
939 Offset = COffset->getZExtValue();
940 if (Offset >= IntSize) {
941 return IC.replaceOperand(
942 II, 1,
943 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
944 }
945 }
946
947 bool Signed = IID == Intrinsic::amdgcn_sbfe;
948
949 if (!CWidth || !COffset)
950 break;
951
952 // The case of Width == 0 is handled above, which makes this transformation
953 // safe. If Width == 0, then the ashr and lshr instructions become poison
954 // value since the shift amount would be equal to the bit size.
955 assert(Width != 0);
956
957 // TODO: This allows folding to undef when the hardware has specific
958 // behavior?
959 if (Offset + Width < IntSize) {
960 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
961 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
962 : IC.Builder.CreateLShr(Shl, IntSize - Width);
963 RightShift->takeName(&II);
964 return IC.replaceInstUsesWith(II, RightShift);
965 }
966
967 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
968 : IC.Builder.CreateLShr(Src, Offset);
969
970 RightShift->takeName(&II);
971 return IC.replaceInstUsesWith(II, RightShift);
972 }
973 case Intrinsic::amdgcn_exp:
974 case Intrinsic::amdgcn_exp_row:
975 case Intrinsic::amdgcn_exp_compr: {
976 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
977 unsigned EnBits = En->getZExtValue();
978 if (EnBits == 0xf)
979 break; // All inputs enabled.
980
981 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
982 bool Changed = false;
983 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
984 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
985 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
986 Value *Src = II.getArgOperand(I + 2);
987 if (!isa<PoisonValue>(Src)) {
988 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
989 Changed = true;
990 }
991 }
992 }
993
994 if (Changed) {
995 return &II;
996 }
997
998 break;
999 }
1000 case Intrinsic::amdgcn_fmed3: {
1001 Value *Src0 = II.getArgOperand(0);
1002 Value *Src1 = II.getArgOperand(1);
1003 Value *Src2 = II.getArgOperand(2);
1004
1005 for (Value *Src : {Src0, Src1, Src2}) {
1006 if (isa<PoisonValue>(Src))
1007 return IC.replaceInstUsesWith(II, Src);
1008 }
1009
1010 if (II.isStrictFP())
1011 break;
1012
1013 // med3 with a nan input acts like
1014 // v_min_f32(v_min_f32(s0, s1), s2)
1015 //
1016 // Signalingness is ignored with ieee=0, so we fold to
1017 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1018 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1019 // returned signaling nan will not be quieted.
1020
1021 // ieee=1
1022 // s0 snan: s2
1023 // s1 snan: s2
1024 // s2 snan: qnan
1025
1026 // s0 qnan: min(s1, s2)
1027 // s1 qnan: min(s0, s2)
1028 // s2 qnan: min(s0, s1)
1029
1030 // ieee=0
1031 // s0 _nan: min(s1, s2)
1032 // s1 _nan: min(s0, s2)
1033 // s2 _nan: min(s0, s1)
1034
1035 // med3 behavior with infinity
1036 // s0 +inf: max(s1, s2)
1037 // s1 +inf: max(s0, s2)
1038 // s2 +inf: max(s0, s1)
1039 // s0 -inf: min(s1, s2)
1040 // s1 -inf: min(s0, s2)
1041 // s2 -inf: min(s0, s1)
1042
1043 // Checking for NaN before canonicalization provides better fidelity when
1044 // mapping other operations onto fmed3 since the order of operands is
1045 // unchanged.
1046 Value *V = nullptr;
1047 const APFloat *ConstSrc0 = nullptr;
1048 const APFloat *ConstSrc1 = nullptr;
1049 const APFloat *ConstSrc2 = nullptr;
1050
1051 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1052 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1053 isa<UndefValue>(Src0)) {
1054 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1055 switch (fpenvIEEEMode(II)) {
1056 case KnownIEEEMode::On:
1057 // TODO: If Src2 is snan, does it need quieting?
1058 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1059 return IC.replaceInstUsesWith(II, Src2);
1060
1061 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1062 : IC.Builder.CreateMinNum(Src1, Src2);
1063 break;
1064 case KnownIEEEMode::Off:
1065 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1066 : IC.Builder.CreateMinimumNum(Src1, Src2);
1067 break;
1068      case KnownIEEEMode::Unknown:
1069        break;
1070 }
1071 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1072 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1073 isa<UndefValue>(Src1)) {
1074 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1075 switch (fpenvIEEEMode(II)) {
1076 case KnownIEEEMode::On:
1077 // TODO: If Src2 is snan, does it need quieting?
1078 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1079 return IC.replaceInstUsesWith(II, Src2);
1080
1081 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1082 : IC.Builder.CreateMinNum(Src0, Src2);
1083 break;
1084 case KnownIEEEMode::Off:
1085 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1086 : IC.Builder.CreateMinimumNum(Src0, Src2);
1087 break;
1088      case KnownIEEEMode::Unknown:
1089        break;
1090 }
1091 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1092 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1093 isa<UndefValue>(Src2)) {
1094 switch (fpenvIEEEMode(II)) {
1095 case KnownIEEEMode::On:
1096 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1097 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1098 return IC.replaceInstUsesWith(II, Quieted);
1099 }
1100
1101 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1102 ? IC.Builder.CreateMaxNum(Src0, Src1)
1103 : IC.Builder.CreateMinNum(Src0, Src1);
1104 break;
1105 case KnownIEEEMode::Off:
1106 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1107 ? IC.Builder.CreateMinimumNum(Src0, Src1)
1108 : IC.Builder.CreateMaximumNum(Src0, Src1);
1109 break;
1110      case KnownIEEEMode::Unknown:
1111        break;
1112 }
1113 }
1114
1115 if (V) {
1116 if (auto *CI = dyn_cast<CallInst>(V)) {
1117 CI->copyFastMathFlags(&II);
1118 CI->takeName(&II);
1119 }
1120 return IC.replaceInstUsesWith(II, V);
1121 }
1122
1123 bool Swap = false;
1124 // Canonicalize constants to RHS operands.
1125 //
1126 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1127 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1128 std::swap(Src0, Src1);
1129 Swap = true;
1130 }
1131
1132 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1133 std::swap(Src1, Src2);
1134 Swap = true;
1135 }
1136
1137 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1138 std::swap(Src0, Src1);
1139 Swap = true;
1140 }
1141
1142 if (Swap) {
1143 II.setArgOperand(0, Src0);
1144 II.setArgOperand(1, Src1);
1145 II.setArgOperand(2, Src2);
1146 return &II;
1147 }
1148
1149 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1150 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1151 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1152 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1153 C2->getValueAPF());
1154 return IC.replaceInstUsesWith(II,
1155 ConstantFP::get(II.getType(), Result));
1156 }
1157 }
1158 }
1159
1160 if (!ST->hasMed3_16())
1161 break;
1162
1163 // Repeat floating-point width reduction done for minnum/maxnum.
1164 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1165 if (Value *X = matchFPExtFromF16(Src0)) {
1166 if (Value *Y = matchFPExtFromF16(Src1)) {
1167 if (Value *Z = matchFPExtFromF16(Src2)) {
1168 Value *NewCall = IC.Builder.CreateIntrinsic(
1169 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1170 return new FPExtInst(NewCall, II.getType());
1171 }
1172 }
1173 }
1174
1175 break;
1176 }
1177 case Intrinsic::amdgcn_icmp:
1178 case Intrinsic::amdgcn_fcmp: {
1179 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1180 // Guard against invalid arguments.
1181 int64_t CCVal = CC->getZExtValue();
1182 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1183 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1184 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1185 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1186                      CCVal > CmpInst::LAST_FCMP_PREDICATE)))
1187      break;
1188
1189 Value *Src0 = II.getArgOperand(0);
1190 Value *Src1 = II.getArgOperand(1);
1191
1192 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1193 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1194        Constant *CCmp = ConstantFoldCompareInstOperands(
1195            (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1196 if (CCmp && CCmp->isNullValue()) {
1197 return IC.replaceInstUsesWith(
1198 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1199 }
1200
1201 // The result of V_ICMP/V_FCMP assembly instructions (which this
1202 // intrinsic exposes) is one bit per thread, masked with the EXEC
1203 // register (which contains the bitmask of live threads). So a
1204 // comparison that always returns true is the same as a read of the
1205 // EXEC register.
1206 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
1207 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
1208 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
1209 CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
1210 II.getType(), Args);
1211 NewCall->addFnAttr(Attribute::Convergent);
1212 NewCall->takeName(&II);
1213 return IC.replaceInstUsesWith(II, NewCall);
1214 }
1215
1216 // Canonicalize constants to RHS.
1217 CmpInst::Predicate SwapPred =
1218        CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
1219    II.setArgOperand(0, Src1);
1220 II.setArgOperand(1, Src0);
1221 II.setArgOperand(
1222 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1223 return &II;
1224 }
1225
1226 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1227 break;
1228
1229 // Canonicalize compare eq with true value to compare != 0
1230 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1231 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1232 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1233 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1234 Value *ExtSrc;
1235 if (CCVal == CmpInst::ICMP_EQ &&
1236 ((match(Src1, PatternMatch::m_One()) &&
1237 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1238 (match(Src1, PatternMatch::m_AllOnes()) &&
1239 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1240 ExtSrc->getType()->isIntegerTy(1)) {
1241      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
1242      IC.replaceOperand(II, 2,
1243 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1244 return &II;
1245 }
1246
1247 CmpPredicate SrcPred;
1248 Value *SrcLHS;
1249 Value *SrcRHS;
1250
1251 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1252 // intrinsic. The typical use is a wave vote function in the library, which
1253 // will be fed from a user code condition compared with 0. Fold in the
1254 // redundant compare.
1255
1256 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1257 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1258 //
1259 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1260 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1261 if (match(Src1, PatternMatch::m_Zero()) &&
1262        match(Src0,
1263              m_ZExtOrSExt(m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1264 PatternMatch::m_Value(SrcRHS))))) {
1265 if (CCVal == CmpInst::ICMP_EQ)
1266 SrcPred = CmpInst::getInversePredicate(SrcPred);
1267
1268 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1269 ? Intrinsic::amdgcn_fcmp
1270 : Intrinsic::amdgcn_icmp;
1271
1272 Type *Ty = SrcLHS->getType();
1273 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1274 // Promote to next legal integer type.
1275 unsigned Width = CmpType->getBitWidth();
1276 unsigned NewWidth = Width;
1277
1278 // Don't do anything for i1 comparisons.
1279 if (Width == 1)
1280 break;
1281
1282 if (Width <= 16)
1283 NewWidth = 16;
1284 else if (Width <= 32)
1285 NewWidth = 32;
1286 else if (Width <= 64)
1287 NewWidth = 64;
1288 else
1289 break; // Can't handle this.
1290
1291 if (Width != NewWidth) {
1292 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1293 if (CmpInst::isSigned(SrcPred)) {
1294 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1295 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1296 } else {
1297 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1298 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1299 }
1300 }
1301 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1302 break;
1303
1304 Value *Args[] = {SrcLHS, SrcRHS,
1305 ConstantInt::get(CC->getType(), SrcPred)};
1306 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1307 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1308 NewCall->takeName(&II);
1309 return IC.replaceInstUsesWith(II, NewCall);
1310 }
1311
1312 break;
1313 }
1314 case Intrinsic::amdgcn_mbcnt_hi: {
1315 // exec_hi is all 0, so this is just a copy.
1316 if (ST->isWave32())
1317 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1318 break;
1319 }
1320 case Intrinsic::amdgcn_ballot: {
1321 Value *Arg = II.getArgOperand(0);
1322 if (isa<PoisonValue>(Arg))
1323 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1324
1325 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1326 if (Src->isZero()) {
1327 // amdgcn.ballot(i1 0) is zero.
1328 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1329 }
1330 }
1331 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1332 // %b64 = call i64 ballot.i64(...)
1333 // =>
1334 // %b32 = call i32 ballot.i32(...)
1335 // %b64 = zext i32 %b32 to i64
1336 Value *Call = IC.Builder.CreateZExt(
1337 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1338 {IC.Builder.getInt32Ty()},
1339 {II.getArgOperand(0)}),
1340 II.getType());
1341 Call->takeName(&II);
1342 return IC.replaceInstUsesWith(II, Call);
1343 }
1344 break;
1345 }
1346 case Intrinsic::amdgcn_wavefrontsize: {
1347 if (ST->isWaveSizeKnown())
1348 return IC.replaceInstUsesWith(
1349 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1350 break;
1351 }
1352 case Intrinsic::amdgcn_wqm_vote: {
1353 // wqm_vote is identity when the argument is constant.
1354 if (!isa<Constant>(II.getArgOperand(0)))
1355 break;
1356
1357 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1358 }
1359 case Intrinsic::amdgcn_kill: {
1360 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1361 if (!C || !C->getZExtValue())
1362 break;
1363
1364 // amdgcn.kill(i1 1) is a no-op
1365 return IC.eraseInstFromFunction(II);
1366 }
1367 case Intrinsic::amdgcn_update_dpp: {
1368 Value *Old = II.getArgOperand(0);
1369
1370 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1371 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1372 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1373 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1374 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1375 break;
1376
1377 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1378 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1379 }
1380 case Intrinsic::amdgcn_permlane16:
1381 case Intrinsic::amdgcn_permlane16_var:
1382 case Intrinsic::amdgcn_permlanex16:
1383 case Intrinsic::amdgcn_permlanex16_var: {
1384 // Discard vdst_in if it's not going to be read.
1385 Value *VDstIn = II.getArgOperand(0);
1386 if (isa<PoisonValue>(VDstIn))
1387 break;
1388
1389 // FetchInvalid operand idx.
1390 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1391 IID == Intrinsic::amdgcn_permlanex16)
1392 ? 4 /* for permlane16 and permlanex16 */
1393 : 3; /* for permlane16_var and permlanex16_var */
1394
1395 // BoundCtrl operand idx.
1396 // For permlane16 and permlanex16 it should be 5
1397 // For Permlane16_var and permlanex16_var it should be 4
1398 unsigned int BcIdx = FiIdx + 1;
1399
1400 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1401 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1402 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1403 break;
1404
1405 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1406 }
1407 case Intrinsic::amdgcn_permlane64:
1408 case Intrinsic::amdgcn_readfirstlane:
1409 case Intrinsic::amdgcn_readlane:
1410 case Intrinsic::amdgcn_ds_bpermute: {
1411 // If the data argument is uniform these intrinsics return it unchanged.
1412 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1413 const Use &Src = II.getArgOperandUse(SrcIdx);
1414 if (isTriviallyUniform(Src))
1415 return IC.replaceInstUsesWith(II, Src.get());
1416
1417 if (IID == Intrinsic::amdgcn_readlane &&
1418        simplifyDemandedLaneMaskArg(IC, II, 1))
1419      return &II;
1420
1421 // If the lane argument of bpermute is uniform, change it to readlane. This
1422 // generates better code and can enable further optimizations because
1423 // readlane is AlwaysUniform.
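    // For example (illustrative): ds_bpermute(i32 8, %v) with a uniform byte
    // address of 8 reads lane 8 >> 2 = 2, so it becomes readlane(%v, 2).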
1424 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1425 const Use &Lane = II.getArgOperandUse(0);
1426 if (isTriviallyUniform(Lane)) {
1427 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1428        Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1429            II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1430 II.setCalledFunction(NewDecl);
1431 II.setOperand(0, Src);
1432 II.setOperand(1, NewLane);
1433 return &II;
1434 }
1435 }
1436
1437 if (IID != Intrinsic::amdgcn_ds_bpermute) {
1438      if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1439        return Res;
1440 }
1441
1442 return std::nullopt;
1443 }
1444 case Intrinsic::amdgcn_writelane: {
1445 // TODO: Fold bitcast like readlane.
1446 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1447 return &II;
1448 return std::nullopt;
1449 }
1450 case Intrinsic::amdgcn_trig_preop: {
1451 // The intrinsic is declared with name mangling, but currently the
1452 // instruction only exists for f64
1453 if (!II.getType()->isDoubleTy())
1454 break;
1455
1456 Value *Src = II.getArgOperand(0);
1457 Value *Segment = II.getArgOperand(1);
1458 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1459 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1460
1461 if (isa<UndefValue>(Src)) {
1462 auto *QNaN = ConstantFP::get(
1463 II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1464 return IC.replaceInstUsesWith(II, QNaN);
1465 }
1466
1467 const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1468 if (!Csrc)
1469 break;
1470
1471 if (II.isStrictFP())
1472 break;
1473
1474 const APFloat &Fsrc = Csrc->getValueAPF();
1475 if (Fsrc.isNaN()) {
1476 auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1477 return IC.replaceInstUsesWith(II, Quieted);
1478 }
1479
1480 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1481 if (!Cseg)
1482 break;
1483
1484 unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1485 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1486 unsigned Shift = SegmentVal * 53;
1487 if (Exponent > 1077)
1488 Shift += Exponent - 1077;
1489
1490 // 2.0/PI table.
1491 static const uint32_t TwoByPi[] = {
1492 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1493 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1494 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1495 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1496 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1497 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1498 0x56033046};
1499
1500 // Return 0 for outbound segment (hardware behavior).
1501 unsigned Idx = Shift >> 5;
1502 if (Idx + 2 >= std::size(TwoByPi)) {
1503 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1504 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1505 }
1506
1507 unsigned BShift = Shift & 0x1f;
1508 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1509 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1510 if (BShift)
1511 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1512 Thi = Thi >> 11;
1513 APFloat Result = APFloat((double)Thi);
1514
1515 int Scale = -53 - Shift;
1516 if (Exponent >= 1968)
1517 Scale += 128;
1518
1519 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1520 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1521 }
1522 case Intrinsic::amdgcn_fmul_legacy: {
1523 Value *Op0 = II.getArgOperand(0);
1524 Value *Op1 = II.getArgOperand(1);
1525
1526 for (Value *Src : {Op0, Op1}) {
1527 if (isa<PoisonValue>(Src))
1528 return IC.replaceInstUsesWith(II, Src);
1529 }
1530
1531 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1532 // infinity, gives +0.0.
1533 // TODO: Move to InstSimplify?
1534 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1535        match(Op1, PatternMatch::m_AnyZeroFP()))
1536      return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1537
1538 // If we can prove we don't have one of the special cases then we can use a
1539 // normal fmul instruction instead.
1540 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1541 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1542 FMul->takeName(&II);
1543 return IC.replaceInstUsesWith(II, FMul);
1544 }
1545 break;
1546 }
1547 case Intrinsic::amdgcn_fma_legacy: {
1548 Value *Op0 = II.getArgOperand(0);
1549 Value *Op1 = II.getArgOperand(1);
1550 Value *Op2 = II.getArgOperand(2);
1551
1552 for (Value *Src : {Op0, Op1, Op2}) {
1553 if (isa<PoisonValue>(Src))
1554 return IC.replaceInstUsesWith(II, Src);
1555 }
1556
1557 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1558 // infinity, gives +0.0.
1559 // TODO: Move to InstSimplify?
1560 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1561        match(Op1, PatternMatch::m_AnyZeroFP())) {
1562      // It's tempting to just return Op2 here, but that would give the wrong
1563 // result if Op2 was -0.0.
1564 auto *Zero = ConstantFP::getZero(II.getType());
1565 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1566 FAdd->takeName(&II);
1567 return IC.replaceInstUsesWith(II, FAdd);
1568 }
1569
1570 // If we can prove we don't have one of the special cases then we can use a
1571 // normal fma instead.
1572 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1573 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1574 II.getModule(), Intrinsic::fma, II.getType()));
1575 return &II;
1576 }
1577 break;
1578 }
1579 case Intrinsic::amdgcn_is_shared:
1580 case Intrinsic::amdgcn_is_private: {
1581 Value *Src = II.getArgOperand(0);
1582 if (isa<PoisonValue>(Src))
1583 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1584 if (isa<UndefValue>(Src))
1585 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1586
1587 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1588 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1589 break;
1590 }
1591 case Intrinsic::amdgcn_make_buffer_rsrc: {
1592 Value *Src = II.getArgOperand(0);
1593 if (isa<PoisonValue>(Src))
1594 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1595 return std::nullopt;
1596 }
1597 case Intrinsic::amdgcn_raw_buffer_store_format:
1598 case Intrinsic::amdgcn_struct_buffer_store_format:
1599 case Intrinsic::amdgcn_raw_tbuffer_store:
1600 case Intrinsic::amdgcn_struct_tbuffer_store:
1601 case Intrinsic::amdgcn_image_store_1d:
1602 case Intrinsic::amdgcn_image_store_1darray:
1603 case Intrinsic::amdgcn_image_store_2d:
1604 case Intrinsic::amdgcn_image_store_2darray:
1605 case Intrinsic::amdgcn_image_store_2darraymsaa:
1606 case Intrinsic::amdgcn_image_store_2dmsaa:
1607 case Intrinsic::amdgcn_image_store_3d:
1608 case Intrinsic::amdgcn_image_store_cube:
1609 case Intrinsic::amdgcn_image_store_mip_1d:
1610 case Intrinsic::amdgcn_image_store_mip_1darray:
1611 case Intrinsic::amdgcn_image_store_mip_2d:
1612 case Intrinsic::amdgcn_image_store_mip_2darray:
1613 case Intrinsic::amdgcn_image_store_mip_3d:
1614 case Intrinsic::amdgcn_image_store_mip_cube: {
1615 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1616 break;
1617
1618 APInt DemandedElts;
1619    if (ST->hasDefaultComponentBroadcast())
1620      DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1621 else if (ST->hasDefaultComponentZero())
1622 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1623 else
1624 break;
1625
1626 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1627 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1628 false)) {
1629 return IC.eraseInstFromFunction(II);
1630 }
1631
1632 break;
1633 }
1634 case Intrinsic::amdgcn_prng_b32: {
1635 auto *Src = II.getArgOperand(0);
1636 if (isa<UndefValue>(Src)) {
1637 return IC.replaceInstUsesWith(II, Src);
1638 }
1639 return std::nullopt;
1640 }
1641 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1642 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1643 Value *Src0 = II.getArgOperand(0);
1644 Value *Src1 = II.getArgOperand(1);
1645 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
1646 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
1647 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1648 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1649
1650 auto getFormatNumRegs = [](unsigned FormatVal) {
1651 switch (FormatVal) {
1654 return 6u;
1656 return 4u;
1659 return 8u;
1660 default:
1661 llvm_unreachable("invalid format value");
1662 }
1663 };
1664
1665 bool MadeChange = false;
1666 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
1667 unsigned Src1NumElts = getFormatNumRegs(BLGP);
1668
1669 // Depending on the used format, fewer registers are required so shrink the
1670 // vector type.
1671 if (Src0Ty->getNumElements() > Src0NumElts) {
1672 Src0 = IC.Builder.CreateExtractVector(
1673 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1674 uint64_t(0));
1675 MadeChange = true;
1676 }
1677
1678 if (Src1Ty->getNumElements() > Src1NumElts) {
1679 Src1 = IC.Builder.CreateExtractVector(
1680 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1681 uint64_t(0));
1682 MadeChange = true;
1683 }
1684
1685 if (!MadeChange)
1686 return std::nullopt;
1687
1688 SmallVector<Value *, 10> Args(II.args());
1689 Args[0] = Src0;
1690 Args[1] = Src1;
1691
1692 CallInst *NewII = IC.Builder.CreateIntrinsic(
1693 IID, {Src0->getType(), Src1->getType()}, Args, &II);
1694 NewII->takeName(&II);
1695 return IC.replaceInstUsesWith(II, NewII);
1696 }
1697 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
1698 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1699 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
1700 Value *Src0 = II.getArgOperand(1);
1701 Value *Src1 = II.getArgOperand(3);
1702 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1703 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
1704 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1705 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1706
1707 bool MadeChange = false;
1708 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
1709 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
1710
1711 // Depending on the used format, fewer registers are required so shrink the
1712 // vector type.
1713 if (Src0Ty->getNumElements() > Src0NumElts) {
1714 Src0 = IC.Builder.CreateExtractVector(
1715 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1716 IC.Builder.getInt64(0));
1717 MadeChange = true;
1718 }
1719
1720 if (Src1Ty->getNumElements() > Src1NumElts) {
1721 Src1 = IC.Builder.CreateExtractVector(
1722 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1723 IC.Builder.getInt64(0));
1724 MadeChange = true;
1725 }
1726
1727 if (!MadeChange)
1728 return std::nullopt;
1729
1730 SmallVector<Value *, 13> Args(II.args());
1731 Args[1] = Src0;
1732 Args[3] = Src1;
1733
1734 CallInst *NewII = IC.Builder.CreateIntrinsic(
1735 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
1736 Args, &II);
1737 NewII->takeName(&II);
1738 return IC.replaceInstUsesWith(II, NewII);
1739 }
1740 }
1741 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1742 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1743 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1744 }
1745 return std::nullopt;
1746}
1747
1748/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1749///
1750/// Simplifying amdgcn image and buffer store intrinsics updates the definition
1751/// of the intrinsic's vector argument, rather than the uses of the result as is
1752/// done for image and buffer loads.
1753/// Note: This only supports non-TFE/LWE image intrinsic calls; the TFE/LWE
1754/// variants return a struct.
 1755static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
 1756                                                     IntrinsicInst &II,
 1757                                                     APInt DemandedElts,
 1758                                                     int DMaskIdx, bool IsLoad) {
1759
1760 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1761 : II.getOperand(0)->getType());
1762 unsigned VWidth = IIVTy->getNumElements();
1763 if (VWidth == 1)
1764 return nullptr;
1765 Type *EltTy = IIVTy->getElementType();
1766
1769
1770 // Assume the arguments are unchanged and later override them, if needed.
1771 SmallVector<Value *, 16> Args(II.args());
1772
1773 if (DMaskIdx < 0) {
1774 // Buffer case.
1775
1776 const unsigned ActiveBits = DemandedElts.getActiveBits();
1777 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1778
 1779 // Start by assuming the whole prefix up to the last demanded element is used,
 1780 // then clear the low bits for trailing zeros in the mask (i.e. unused
 1781 // components at the front of the vector) and fold them into the offset.
1782 DemandedElts = (1 << ActiveBits) - 1;
1783
1784 if (UnusedComponentsAtFront > 0) {
1785 static const unsigned InvalidOffsetIdx = 0xf;
1786
1787 unsigned OffsetIdx;
1788 switch (II.getIntrinsicID()) {
1789 case Intrinsic::amdgcn_raw_buffer_load:
1790 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1791 OffsetIdx = 1;
1792 break;
1793 case Intrinsic::amdgcn_s_buffer_load:
 1794 // If the resulting type is vec3, there is no point in trimming the
 1795 // load with an updated offset, as the vec3 would most likely be widened to
 1796 // vec4 anyway during lowering.
1797 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1798 OffsetIdx = InvalidOffsetIdx;
1799 else
1800 OffsetIdx = 1;
1801 break;
1802 case Intrinsic::amdgcn_struct_buffer_load:
1803 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1804 OffsetIdx = 2;
1805 break;
1806 default:
1807 // TODO: handle tbuffer* intrinsics.
1808 OffsetIdx = InvalidOffsetIdx;
1809 break;
1810 }
1811
1812 if (OffsetIdx != InvalidOffsetIdx) {
1813 // Clear demanded bits and update the offset.
1814 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1815 auto *Offset = Args[OffsetIdx];
1816 unsigned SingleComponentSizeInBits =
1817 IC.getDataLayout().getTypeSizeInBits(EltTy);
1818 unsigned OffsetAdd =
1819 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1820 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1821 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1822 }
1823 }
1824 } else {
1825 // Image case.
1826
1827 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1828 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1829
1830 // dmask 0 has special semantics, do not simplify.
1831 if (DMaskVal == 0)
1832 return nullptr;
1833
1834 // Mask off values that are undefined because the dmask doesn't cover them
1835 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1836
1837 unsigned NewDMaskVal = 0;
1838 unsigned OrigLdStIdx = 0;
1839 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1840 const unsigned Bit = 1 << SrcIdx;
1841 if (!!(DMaskVal & Bit)) {
1842 if (!!DemandedElts[OrigLdStIdx])
1843 NewDMaskVal |= Bit;
1844 OrigLdStIdx++;
1845 }
1846 }
1847
1848 if (DMaskVal != NewDMaskVal)
1849 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1850 }
1851
1852 unsigned NewNumElts = DemandedElts.popcount();
1853 if (!NewNumElts)
1854 return PoisonValue::get(IIVTy);
1855
1856 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1857 if (DMaskIdx >= 0)
1858 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1859 return nullptr;
1860 }
1861
1862 // Validate function argument and return types, extracting overloaded types
1863 // along the way.
1864 SmallVector<Type *, 6> OverloadTys;
1865 if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1866 return nullptr;
1867
1868 Type *NewTy =
1869 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1870 OverloadTys[0] = NewTy;
1871
1872 if (!IsLoad) {
1873 SmallVector<int, 8> EltMask;
1874 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1875 if (DemandedElts[OrigStoreIdx])
1876 EltMask.push_back(OrigStoreIdx);
1877
1878 if (NewNumElts == 1)
1879 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1880 else
1881 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1882 }
1883
1884 CallInst *NewCall =
1885 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
1886 NewCall->takeName(&II);
1887 NewCall->copyMetadata(II);
1888
1889 if (IsLoad) {
1890 if (NewNumElts == 1) {
1891 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1892 DemandedElts.countr_zero());
1893 }
1894
1895 SmallVector<int, 8> EltMask;
1896 unsigned NewLoadIdx = 0;
1897 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1898 if (!!DemandedElts[OrigLoadIdx])
1899 EltMask.push_back(NewLoadIdx++);
1900 else
1901 EltMask.push_back(NewNumElts);
1902 }
1903
1904 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1905
1906 return Shuffle;
1907 }
1908
1909 return NewCall;
1910}
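In the image path of the function above, narrowing the dmask is pure bit manipulation: each set dmask bit corresponds to one element of the vector value, and a bit survives only if that element is still demanded. A self-contained sketch of just that computation (the helper name is illustrative, not part of this file):

#include <cassert>
#include <cstdint>

// Recompute a 4-bit image dmask from the elements of the corresponding vector
// value that are still demanded, mirroring the DMask loop above.
static unsigned narrowDMask(unsigned DMaskVal, uint64_t DemandedElts) {
  assert((DMaskVal & ~0xfu) == 0 && "dmask is a 4-bit field");
  unsigned NewDMaskVal = 0;
  unsigned OrigLdStIdx = 0; // Position within the vector of enabled channels.
  for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
    unsigned Bit = 1u << SrcIdx;
    if (DMaskVal & Bit) {
      if (DemandedElts & (UINT64_C(1) << OrigLdStIdx))
        NewDMaskVal |= Bit;
      ++OrigLdStIdx;
    }
  }
  return NewDMaskVal;
}

For example, a load with dmask 0b1011 returns a 3-element vector for channels x, y and w; if only element 2 of that result is used, narrowDMask(0b1011, 0b100) yields 0b1000 and the call is rebuilt as a single-channel load of w. The buffer path instead folds skipped leading components into the offset operand, e.g. dropping the first component of a <4 x float> load adds 4 bytes to the offset.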
1911
 1912Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
 1913    InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
1914 APInt &UndefElts) const {
1915 auto *VT = dyn_cast<FixedVectorType>(II.getType());
1916 if (!VT)
1917 return nullptr;
1918
1919 const unsigned FirstElt = DemandedElts.countr_zero();
1920 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
1921 const unsigned MaskLen = LastElt - FirstElt + 1;
1922
1923 unsigned OldNumElts = VT->getNumElements();
1924 if (MaskLen == OldNumElts && MaskLen != 1)
1925 return nullptr;
1926
1927 Type *EltTy = VT->getElementType();
1928 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
1929
1930 // Theoretically we should support these intrinsics for any legal type. Avoid
1931 // introducing cases that aren't direct register types like v3i16.
1932 if (!isTypeLegal(NewVT))
1933 return nullptr;
1934
1935 Value *Src = II.getArgOperand(0);
1936
1937 // Make sure convergence tokens are preserved.
1938 // TODO: CreateIntrinsic should allow directly copying bundles
 1939  SmallVector<OperandBundleDef, 1> OpBundles;
 1940  II.getOperandBundlesAsDefs(OpBundles);
1941
 1942  Module *M = IC.Builder.GetInsertBlock()->getModule();
 1943  Function *Remangled =
1944 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
1945
1946 if (MaskLen == 1) {
1947 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
1948
1949 // TODO: Preserve callsite attributes?
1950 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
1951
1952 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
1953 NewCall, FirstElt);
1954 }
1955
1956 SmallVector<int> ExtractMask(MaskLen, -1);
1957 for (unsigned I = 0; I != MaskLen; ++I) {
1958 if (DemandedElts[FirstElt + I])
1959 ExtractMask[I] = FirstElt + I;
1960 }
1961
1962 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
1963
1964 // TODO: Preserve callsite attributes?
1965 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
1966
1967 SmallVector<int> InsertMask(OldNumElts, -1);
1968 for (unsigned I = 0; I != MaskLen; ++I) {
1969 if (DemandedElts[FirstElt + I])
1970 InsertMask[FirstElt + I] = I;
1971 }
1972
1973 // FIXME: If the call has a convergence bundle, we end up leaving the dead
1974 // call behind.
1975 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
1976}
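The two shuffle masks built above are easiest to see on a concrete case. A small standalone sketch (the vector width and demanded bits are invented for illustration) that reproduces the ExtractMask/InsertMask computation for an 8-element readfirstlane where only elements 2..4 are demanded:

#include <cstdio>
#include <vector>

int main() {
  const unsigned OldNumElts = 8;
  const unsigned DemandedElts = 0b00011100; // Elements 2, 3 and 4 are used.

  // FirstElt, LastElt and MaskLen as computed in the member function above.
  const unsigned FirstElt = __builtin_ctz(DemandedElts);     // 2
  const unsigned LastElt = 31 - __builtin_clz(DemandedElts); // 4
  const unsigned MaskLen = LastElt - FirstElt + 1;           // 3

  // ExtractMask pulls the demanded window out of the source vector; -1 marks
  // a "don't care" (poison) lane.
  std::vector<int> ExtractMask(MaskLen, -1);
  for (unsigned I = 0; I != MaskLen; ++I)
    if (DemandedElts & (1u << (FirstElt + I)))
      ExtractMask[I] = FirstElt + I;

  // InsertMask scatters the narrowed intrinsic's result back into the
  // original element positions, leaving the other lanes poison.
  std::vector<int> InsertMask(OldNumElts, -1);
  for (unsigned I = 0; I != MaskLen; ++I)
    if (DemandedElts & (1u << (FirstElt + I)))
      InsertMask[FirstElt + I] = I;

  for (int M : ExtractMask)
    std::printf("%d ", M); // 2 3 4
  std::printf("\n");
  for (int M : InsertMask)
    std::printf("%d ", M); // -1 -1 0 1 2 -1 -1 -1
  std::printf("\n");
  return 0;
}

So the narrowed call operates on a 3-element value extracted from lanes 2..4, and its result is shuffled back so that only those lanes of the original vector are defined.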
1977
 1978std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
 1979    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1980 APInt &UndefElts2, APInt &UndefElts3,
1981 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1982 SimplifyAndSetOp) const {
1983 switch (II.getIntrinsicID()) {
1984 case Intrinsic::amdgcn_readfirstlane:
1985 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1986 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
1987 case Intrinsic::amdgcn_raw_buffer_load:
1988 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1989 case Intrinsic::amdgcn_raw_buffer_load_format:
1990 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1991 case Intrinsic::amdgcn_raw_tbuffer_load:
1992 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1993 case Intrinsic::amdgcn_s_buffer_load:
1994 case Intrinsic::amdgcn_struct_buffer_load:
1995 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1996 case Intrinsic::amdgcn_struct_buffer_load_format:
1997 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1998 case Intrinsic::amdgcn_struct_tbuffer_load:
1999 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2000 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2001 default: {
2002 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2003 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2004 }
2005 break;
2006 }
2007 }
2008 return std::nullopt;
2009}