1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/Dominators.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/InstVisitor.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/Pass.h"
36
37#define DEBUG_TYPE "amdgpu-codegenprepare"
38
39using namespace llvm;
40using namespace llvm::PatternMatch;
41
42namespace {
43
45 "amdgpu-codegenprepare-widen-constant-loads",
46 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
48 cl::init(false));
49
50static cl::opt<bool>
51 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
52 cl::desc("Break large PHI nodes for DAGISel"),
53 cl::ReallyHidden, cl::init(true));
54
55static cl::opt<bool>
56 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
57 cl::desc("For testing purposes, always break large "
58 "PHIs even if it isn't profitable."),
59 cl::ReallyHidden, cl::init(false));
60
61static cl::opt<unsigned> BreakLargePHIsThreshold(
62 "amdgpu-codegenprepare-break-large-phis-threshold",
63 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
64 cl::ReallyHidden, cl::init(32));
65
66static cl::opt<bool> UseMul24Intrin(
67 "amdgpu-codegenprepare-mul24",
68 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
70 cl::init(true));
71
72// Legalize 64-bit division by using the generic IR expansion.
73static cl::opt<bool> ExpandDiv64InIR(
74 "amdgpu-codegenprepare-expand-div64",
75 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
77 cl::init(false));
78
79// Leave all division operations as they are. This supersedes ExpandDiv64InIR
80// and is used for testing the legalizer.
81static cl::opt<bool> DisableIDivExpand(
82 "amdgpu-codegenprepare-disable-idiv-expansion",
83 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
85 cl::init(false));
86
87// Disable processing of fdiv so we can better test the backend implementations.
88static cl::opt<bool> DisableFDivExpand(
89 "amdgpu-codegenprepare-disable-fdiv-expansion",
90 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
92 cl::init(false));
93
94class AMDGPUCodeGenPrepareImpl
95 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
96public:
97 Function &F;
98 const GCNSubtarget &ST;
99 const AMDGPUTargetMachine &TM;
100 const TargetLibraryInfo *TLI;
101 AssumptionCache *AC;
102 const DominatorTree *DT;
103 const UniformityInfo &UA;
104 const DataLayout &DL;
105 const bool HasFP32DenormalFlush;
106 bool FlowChanged = false;
107 mutable Function *SqrtF32 = nullptr;
108 mutable Function *LdexpF32 = nullptr;
109
110 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
111
112 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
113 const TargetLibraryInfo *TLI, AssumptionCache *AC,
114 const DominatorTree *DT, const UniformityInfo &UA)
115 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
116 DT(DT), UA(UA), DL(F.getDataLayout()),
117 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
118 DenormalMode::getPreserveSign()) {}
119
120 Function *getSqrtF32() const {
121 if (SqrtF32)
122 return SqrtF32;
123
124 LLVMContext &Ctx = F.getContext();
125 SqrtF32 = Intrinsic::getOrInsertDeclaration(
126 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
127 return SqrtF32;
128 }
129
130 Function *getLdexpF32() const {
131 if (LdexpF32)
132 return LdexpF32;
133
134 LLVMContext &Ctx = F.getContext();
135 LdexpF32 = Intrinsic::getOrInsertDeclaration(
136 F.getParent(), Intrinsic::ldexp,
137 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
138 return LdexpF32;
139 }
140
141 bool canBreakPHINode(const PHINode &I);
142
143 /// \returns True if binary operation \p I is a signed binary operation, false
144 /// otherwise.
145 bool isSigned(const BinaryOperator &I) const;
146
147 /// \returns True if the condition of 'select' operation \p I comes from a
148 /// signed 'icmp' operation, false otherwise.
149 bool isSigned(const SelectInst &I) const;
150
151 /// Return true if \p T is a legal scalar floating point type.
152 bool isLegalFloatingTy(const Type *T) const;
153
154 /// Wrapper to pass all the arguments to computeKnownFPClass
155 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
156 const Instruction *CtxI) const {
157 return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT);
158 }
159
160 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
161 return HasFP32DenormalFlush ||
163 }
164
165 /// \returns The minimum number of bits needed to store the value of \Op as an
166 /// unsigned integer. Truncating to this size and then zero-extending to
167 /// the original will not change the value.
168 unsigned numBitsUnsigned(Value *Op) const;
169
170 /// \returns The minimum number of bits needed to store the value of \Op as a
171 /// signed integer. Truncating to this size and then sign-extending to
172 /// the original size will not change the value.
173 unsigned numBitsSigned(Value *Op) const;
174
175 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
176 /// SelectionDAG has an issue where an 'and' asserting which bits are known
177 /// can be lost, so do this on the IR instead.
177 bool replaceMulWithMul24(BinaryOperator &I) const;
178
179 /// Perform the same transform as the equivalently named function in
180 /// DAGCombiner. Since we expand some divisions here, we need to perform this
181 /// before the expansion obscures the pattern.
181 bool foldBinOpIntoSelect(BinaryOperator &I) const;
182
183 bool divHasSpecialOptimization(BinaryOperator &I,
184 Value *Num, Value *Den) const;
185 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
186 unsigned MaxDivBits, bool Signed) const;
187
188 /// Expands 24 bit div or rem.
189 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
190 Value *Num, Value *Den,
191 bool IsDiv, bool IsSigned) const;
192
193 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
194 Value *Num, Value *Den, unsigned NumBits,
195 bool IsDiv, bool IsSigned) const;
196
197 /// Expands 32 bit div or rem.
198 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
199 Value *Num, Value *Den) const;
200
201 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
202 Value *Num, Value *Den) const;
203 void expandDivRem64(BinaryOperator &I) const;
204
205 /// Widen a scalar load.
206 ///
207 /// \details Widen a uniform, small-type load from constant memory to a full
208 /// 32 bits and then truncate the result, to allow a scalar load instead of a
209 /// vector load.
210 ///
211 /// \returns True.
212
213 bool canWidenScalarExtLoad(LoadInst &I) const;
214
215 Value *matchFractPat(IntrinsicInst &I);
216 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
217
218 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
219 FastMathFlags SqrtFMF) const;
220
221 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
222 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
223 const Instruction *CtxI) const;
224
225 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
226 FastMathFlags FMF, const Instruction *CtxI) const;
227 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
228 float ReqdAccuracy) const;
229
230 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
231 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
232 Value *RsqOp, const Instruction *FDiv,
233 float ReqdAccuracy) const;
234
235 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
236 Value *Src) const;
237
238 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
239 bool IsNegative) const;
240 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
241 FastMathFlags FMF) const;
242 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
243 FastMathFlags FMF) const;
244
245public:
246 bool visitFDiv(BinaryOperator &I);
247
248 bool visitInstruction(Instruction &I) { return false; }
250 bool visitLoadInst(LoadInst &I);
252 bool visitPHINode(PHINode &I);
254
256 bool visitFMinLike(IntrinsicInst &I);
257 bool visitSqrt(IntrinsicInst &I);
258 bool run();
259};
260
261class AMDGPUCodeGenPrepare : public FunctionPass {
262public:
263 static char ID;
264 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
265 void getAnalysisUsage(AnalysisUsage &AU) const override {
269
270 // FIXME: Division expansion needs to preserve the dominator tree.
271 if (!ExpandDiv64InIR)
272 AU.setPreservesAll();
273 }
274 bool runOnFunction(Function &F) override;
275 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
276};
277
278} // end anonymous namespace
279
280bool AMDGPUCodeGenPrepareImpl::run() {
281 BreakPhiNodesCache.clear();
282 bool MadeChange = false;
283
284 Function::iterator NextBB;
285 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
286 BasicBlock *BB = &*FI;
287 NextBB = std::next(FI);
288
290 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
291 I = Next) {
292 Next = std::next(I);
293
294 MadeChange |= visit(*I);
295
296 if (Next != E) { // Control flow changed
297 BasicBlock *NextInstBB = Next->getParent();
298 if (NextInstBB != BB) {
299 BB = NextInstBB;
300 E = BB->end();
301 FE = F.end();
302 }
303 }
304 }
305 }
306 return MadeChange;
307}
308
309bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
310 return I.getOpcode() == Instruction::AShr ||
311 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
312}
313
314bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
315 return isa<ICmpInst>(I.getOperand(0)) &&
316 cast<ICmpInst>(I.getOperand(0))->isSigned();
317}
318
319bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
320 return Ty->isFloatTy() || Ty->isDoubleTy() ||
321 (Ty->isHalfTy() && ST.has16BitInsts());
322}
323
324bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
325 Type *Ty = I.getType();
326 int TySize = DL.getTypeSizeInBits(Ty);
327 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
328
329 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
330}
331
332unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
333 return computeKnownBits(Op, DL, AC).countMaxActiveBits();
334}
335
336unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
337 return ComputeMaxSignificantBits(Op, DL, AC);
338}
339
340static void extractValues(IRBuilder<> &Builder,
341 SmallVectorImpl<Value *> &Values, Value *V) {
342 auto *VT = dyn_cast<FixedVectorType>(V->getType());
343 if (!VT) {
344 Values.push_back(V);
345 return;
346 }
347
348 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
349 Values.push_back(Builder.CreateExtractElement(V, I));
350}
351
352static Value *insertValues(IRBuilder<> &Builder,
353 Type *Ty,
354 SmallVectorImpl<Value *> &Values) {
355 if (!Ty->isVectorTy()) {
356 assert(Values.size() == 1);
357 return Values[0];
358 }
359
360 Value *NewVal = PoisonValue::get(Ty);
361 for (int I = 0, E = Values.size(); I != E; ++I)
362 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
363
364 return NewVal;
365}
366
367bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
368 if (I.getOpcode() != Instruction::Mul)
369 return false;
370
371 Type *Ty = I.getType();
372 unsigned Size = Ty->getScalarSizeInBits();
373 if (Size <= 16 && ST.has16BitInsts())
374 return false;
375
376 // Prefer scalar if this could be s_mul_i32
377 if (UA.isUniform(&I))
378 return false;
379
380 Value *LHS = I.getOperand(0);
381 Value *RHS = I.getOperand(1);
382 IRBuilder<> Builder(&I);
383 Builder.SetCurrentDebugLocation(I.getDebugLoc());
384
385 unsigned LHSBits = 0, RHSBits = 0;
386 bool IsSigned = false;
387
388 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
389 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
390 IsSigned = false;
391
392 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
393 (RHSBits = numBitsSigned(RHS)) <= 24) {
394 IsSigned = true;
395
396 } else
397 return false;
398
399 SmallVector<Value *, 4> LHSVals;
400 SmallVector<Value *, 4> RHSVals;
401 SmallVector<Value *, 4> ResultVals;
402 extractValues(Builder, LHSVals, LHS);
403 extractValues(Builder, RHSVals, RHS);
404
405 IntegerType *I32Ty = Builder.getInt32Ty();
406 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
407 Type *DstTy = LHSVals[0]->getType();
408
409 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
410 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
411 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
412 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
413 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
414 Intrinsic::ID ID =
415 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
416 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
417 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
418 : Builder.CreateZExtOrTrunc(Result, DstTy);
419 ResultVals.push_back(Result);
420 }
421
422 Value *NewVal = insertValues(Builder, Ty, ResultVals);
423 NewVal->takeName(&I);
424 I.replaceAllUsesWith(NewVal);
425 I.eraseFromParent();
426
427 return true;
428}
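// Illustrative example (not from the original source): for a divergent i32
// multiply whose operands are both known to fit in 24 unsigned bits,
//
//   %mul = mul i32 %x, %y
//
// becomes roughly
//
//   %mul24 = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
//
// The signed variant uses @llvm.amdgcn.mul.s24, with sext/trunc instead of
// zext/trunc around the intrinsic when the original type is not i32.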
429
430// Find a select instruction, which may have been cast. This is mostly to deal
431// with cases where i16 selects were promoted here to i32.
432static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
433 Cast = nullptr;
434 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
435 return Sel;
436
437 if ((Cast = dyn_cast<CastInst>(V))) {
438 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
439 return Sel;
440 }
441
442 return nullptr;
443}
444
445bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
446 // Don't do this unless the old select is going away. We want to eliminate the
447 // binary operator, not replace a binop with a select.
448 int SelOpNo = 0;
449
450 CastInst *CastOp;
451
452 // TODO: Should probably try to handle some cases with multiple
453 // users. Duplicating the select may be profitable for division.
454 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
455 if (!Sel || !Sel->hasOneUse()) {
456 SelOpNo = 1;
457 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
458 }
459
460 if (!Sel || !Sel->hasOneUse())
461 return false;
462
463 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
464 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
465 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
466 if (!CBO || !CT || !CF)
467 return false;
468
469 if (CastOp) {
470 if (!CastOp->hasOneUse())
471 return false;
472 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
473 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
474 }
475
476 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
477 // need to handle divisions here.
478 Constant *FoldedT =
479 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
480 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
481 if (!FoldedT || isa<ConstantExpr>(FoldedT))
482 return false;
483
484 Constant *FoldedF =
485 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
486 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
487 if (!FoldedF || isa<ConstantExpr>(FoldedF))
488 return false;
489
490 IRBuilder<> Builder(&BO);
491 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
492 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
493 Builder.setFastMathFlags(FPOp->getFastMathFlags());
494
495 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
496 FoldedT, FoldedF);
497 NewSelect->takeName(&BO);
498 BO.replaceAllUsesWith(NewSelect);
499 BO.eraseFromParent();
500 if (CastOp)
501 CastOp->eraseFromParent();
502 Sel->eraseFromParent();
503 return true;
504}
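// Illustrative example (not from the original source): given a single-use
// select with constant arms feeding a division by a constant, e.g.
//
//   %sel = select i1 %c, i32 4, i32 8
//   %div = udiv i32 16, %sel
//
// the binop is folded into the select, leaving only
//
//   %div = select i1 %c, i32 4, i32 2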
505
506std::pair<Value *, Value *>
507AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
508 Value *Src) const {
509 Type *Ty = Src->getType();
510 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
511 {Ty, Builder.getInt32Ty()}, Src);
512 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
513
514 // Bypass the bug workaround for the exponent result since it doesn't matter.
515 // TODO: Does the bug workaround even really need to consider the exponent
516 // result? It's unspecified by the spec.
517
518 Value *FrexpExp =
519 ST.hasFractBug()
520 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
521 {Builder.getInt32Ty(), Ty}, Src)
522 : Builder.CreateExtractValue(Frexp, {1});
523 return {FrexpMant, FrexpExp};
524}
525
526/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
527Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
528 Value *Src,
529 bool IsNegative) const {
530 // Same as for 1.0, but expand the sign out of the constant.
531 // -1.0 / x -> rcp (fneg x)
532 if (IsNegative)
533 Src = Builder.CreateFNeg(Src);
534
535 // The rcp instruction doesn't support denormals, so scale the input
536 // out of the denormal range and convert at the end.
537 //
538 // Expand as 2^-n * (1.0 / (x * 2^n))
539
540 // TODO: Skip scaling if input is known never denormal and the input
541 // range won't underflow to denormal. The hard part is knowing the
542 // result. We need a range check, the result could be denormal for
543 // 0x1p+126 < den <= 0x1p+127.
544 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
545 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
546 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
547 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
548}
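// Illustrative pseudo-IR sketch (not from the original source) of the sequence
// built above for 1.0 / x on f32:
//
//   { %mant, %exp } = frexp(%x)                      ; %x == %mant * 2^%exp
//   %negexp = sub i32 0, %exp
//   %rcp    = call float @llvm.amdgcn.rcp.f32(float %mant)
//   %res    = call float @llvm.ldexp.f32.i32(float %rcp, i32 %negexp)
//
// i.e. 1.0 / (m * 2^e) == (1.0 / m) * 2^-e, so the rcp only ever sees a
// mantissa with magnitude in [0.5, 1.0), which is never denormal.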
549
550/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
551Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
552 Value *RHS,
553 FastMathFlags FMF) const {
554 // If we have to work around the fract/frexp bug, we're worse off than
555 // using the fdiv.fast expansion. The full safe expansion is faster if we have
556 // fast FMA.
557 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
558 (!FMF.noNaNs() || !FMF.noInfs()))
559 return nullptr;
560
561 // We're scaling the LHS to avoid a denormal input, and scaling the
562 // denominator to avoid large values underflowing the result.
563 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
564
565 Value *Rcp =
566 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
567
568 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
569 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
570
571 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
572 // result.
573 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
574 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
575}
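// Illustrative sketch (not from the original source): for a / b this expands
// to roughly ldexp(mant_a * rcp(mant_b), exp_a - exp_b), so neither the rcp
// nor the multiply sees a denormal or extremely large intermediate value.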
576
577/// Emit a sqrt that handles denormals and is accurate to 2ulp.
578Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
579 Value *Src,
580 FastMathFlags FMF) const {
581 Type *Ty = Src->getType();
582 APFloat SmallestNormal =
583 APFloat::getSmallestNormalized(Ty->getFltSemantics());
584 Value *NeedScale =
585 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
586
587 ConstantInt *Zero = Builder.getInt32(0);
588 Value *InputScaleFactor =
589 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
590
591 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
592
593 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
594
595 Value *OutputScaleFactor =
596 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
597 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
598}
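// Why the 32 / -16 scale pair above is exact (illustrative note, not from the
// original source): when the input is below the smallest normal it is scaled
// by ldexp(x, 32) == x * 2^32, and sqrt(x * 2^32) == sqrt(x) * 2^16, so
// ldexp(..., -16) undoes the scaling:
//
//   ldexp(sqrt(ldexp(x, 32)), -16) == sqrt(x)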
599
600/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
601static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
602 bool IsNegative) {
603 // bool need_scale = x < 0x1p-126f;
604 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
605 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
606 // rsq(x * input_scale) * output_scale;
607
608 Type *Ty = Src->getType();
609 APFloat SmallestNormal =
610 APFloat::getSmallestNormalized(Ty->getFltSemantics());
611 Value *NeedScale =
612 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
613 Constant *One = ConstantFP::get(Ty, 1.0);
614 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
615 Constant *OutputScale =
616 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
617
618 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
619
620 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
621 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
622 Value *OutputScaleFactor = Builder.CreateSelect(
623 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
624
625 return Builder.CreateFMul(Rsq, OutputScaleFactor);
626}
627
628bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
629 FastMathFlags DivFMF,
630 FastMathFlags SqrtFMF) const {
631 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
632 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
633 return false;
634
635 // v_rsq_f32 gives 1ulp
636 return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
637}
638
639Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
640 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
641 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
642 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
643 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
644
645 // rsq_f16 is accurate to 0.51 ulp.
646 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
647 // rsq_f64 is never accurate.
648 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
649 if (!CLHS)
650 return nullptr;
651
652 assert(Den->getType()->isFloatTy());
653
654 bool IsNegative = false;
655
656 // TODO: Handle other numerator values with arcp.
657 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
658 // Add in the sqrt flags.
659 IRBuilder<>::FastMathFlagGuard Guard(Builder);
660 Builder.setFastMathFlags(DivFMF | SqrtFMF);
661
662 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
663 canIgnoreDenormalInput(Den, CtxI)) {
664 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
665 // -1.0 / sqrt(x) -> fneg(rsq(x))
666 return IsNegative ? Builder.CreateFNeg(Result) : Result;
667 }
668
669 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
670 }
671
672 return nullptr;
673}
674
675// Optimize fdiv with rcp:
676//
677// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
678// allowed with afn.
679//
680// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
681Value *
682AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
683 Value *Den, FastMathFlags FMF,
684 const Instruction *CtxI) const {
685 // rcp_f16 is accurate to 0.51 ulp.
686 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
687 // rcp_f64 is never accurate.
688 assert(Den->getType()->isFloatTy());
689
690 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
691 bool IsNegative = false;
692 if (CLHS->isExactlyValue(1.0) ||
693 (IsNegative = CLHS->isExactlyValue(-1.0))) {
694 Value *Src = Den;
695
696 if (HasFP32DenormalFlush || FMF.approxFunc()) {
697 // -1.0 / x -> 1.0 / fneg(x)
698 if (IsNegative)
699 Src = Builder.CreateFNeg(Src);
700
701 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
702 // the CI documentation have a worst case error of 1 ulp.
703 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
704 // to use it as long as we aren't trying to use denormals.
705 //
706 // v_rcp_f16 and v_rsq_f16 DO support denormals.
707
708 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
709 // insert rsq intrinsic here.
710
711 // 1.0 / x -> rcp(x)
712 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
713 }
714
715 // TODO: If the input isn't denormal, and we know the input exponent isn't
716 // big enough to introduce a denormal we can avoid the scaling.
717 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
718 }
719 }
720
721 if (FMF.allowReciprocal()) {
722 // x / y -> x * (1.0 / y)
723
724 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
725 // will never underflow.
726 if (HasFP32DenormalFlush || FMF.approxFunc()) {
727 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
728 return Builder.CreateFMul(Num, Recip);
729 }
730
731 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
732 return Builder.CreateFMul(Num, Recip);
733 }
734
735 return nullptr;
736}
737
738// optimize with fdiv.fast:
739//
740// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
741//
742// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
743//
744// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
745Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
746 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
747 // fdiv.fast can achieve 2.5 ULP accuracy.
748 if (ReqdAccuracy < 2.5f)
749 return nullptr;
750
751 // Only have fdiv.fast for f32.
752 assert(Den->getType()->isFloatTy());
753
754 bool NumIsOne = false;
755 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
756 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
757 NumIsOne = true;
758 }
759
760 // fdiv.fast does not support denormals, but for 1.0/x it is always fine to use.
761 //
762 // TODO: This works for any value with a specific known exponent range, don't
763 // just limit to constant 1.
764 if (!HasFP32DenormalFlush && !NumIsOne)
765 return nullptr;
766
767 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
768}
769
770Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
771 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
772 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
773 float ReqdDivAccuracy) const {
774 if (RsqOp) {
775 Value *Rsq =
776 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
777 if (Rsq)
778 return Rsq;
779 }
780
781 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
782 if (Rcp)
783 return Rcp;
784
785 // In the basic case fdiv_fast has the same instruction count as the frexp div
786 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
787 // potentially be fused into a user. Also, materialization of the constants
788 // can be reused for multiple instances.
789 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
790 if (FDivFast)
791 return FDivFast;
792
793 return emitFrexpDiv(Builder, Num, Den, DivFMF);
794}
795
796// Optimization is performed based on fpmath, fast math flags, and the
797// denormal mode to optimize fdiv with either rcp or fdiv.fast.
798//
799// With rcp:
800// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
801// allowed with afn.
802//
803// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
804//
805// With fdiv.fast:
806// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
807//
808// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
809//
810// NOTE: rcp is the preference in cases that both are legal.
811bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
812 if (DisableFDivExpand)
813 return false;
814
815 Type *Ty = FDiv.getType()->getScalarType();
816 if (!Ty->isFloatTy())
817 return false;
818
819 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
820 // expansion around them in codegen. f16 is good enough to always use.
821
822 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
823 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
824 const float ReqdAccuracy = FPOp->getFPAccuracy();
825
826 FastMathFlags SqrtFMF;
827
828 Value *Num = FDiv.getOperand(0);
829 Value *Den = FDiv.getOperand(1);
830
831 Value *RsqOp = nullptr;
832 auto *DenII = dyn_cast<IntrinsicInst>(Den);
833 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
834 DenII->hasOneUse()) {
835 const auto *SqrtOp = cast<FPMathOperator>(DenII);
836 SqrtFMF = SqrtOp->getFastMathFlags();
837 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
838 RsqOp = SqrtOp->getOperand(0);
839 }
840
841 // Inaccurate rcp is allowed with afn.
842 //
843 // Defer to codegen to handle this.
844 //
845 // TODO: Decide on an interpretation for interactions between afn + arcp +
846 // !fpmath, and make it consistent between here and codegen. For now, defer
847 // expansion of afn to codegen. The current interpretation is so aggressive we
848 // don't need any pre-consideration here when we have better information. A
849 // more conservative interpretation could use handling here.
850 const bool AllowInaccurateRcp = DivFMF.approxFunc();
851 if (!RsqOp && AllowInaccurateRcp)
852 return false;
853
854 // Defer the correct implementations to codegen.
855 if (ReqdAccuracy < 1.0f)
856 return false;
857
858 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
859 Builder.setFastMathFlags(DivFMF);
860 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
861
862 SmallVector<Value *, 4> NumVals;
863 SmallVector<Value *, 4> DenVals;
864 SmallVector<Value *, 4> RsqDenVals;
865 extractValues(Builder, NumVals, Num);
866 extractValues(Builder, DenVals, Den);
867
868 if (RsqOp)
869 extractValues(Builder, RsqDenVals, RsqOp);
870
871 SmallVector<Value *, 4> ResultVals(NumVals.size());
872 for (int I = 0, E = NumVals.size(); I != E; ++I) {
873 Value *NumElt = NumVals[I];
874 Value *DenElt = DenVals[I];
875 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
876
877 Value *NewElt =
878 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
879 cast<Instruction>(FPOp), ReqdAccuracy);
880 if (!NewElt) {
881 // Keep the original, but scalarized.
882
883 // This has the unfortunate side effect of sometimes scalarizing when
884 // we're not going to do anything.
885 NewElt = Builder.CreateFDiv(NumElt, DenElt);
886 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
887 NewEltInst->copyMetadata(FDiv);
888 }
889
890 ResultVals[I] = NewElt;
891 }
892
893 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
894
895 if (NewVal) {
896 FDiv.replaceAllUsesWith(NewVal);
897 NewVal->takeName(&FDiv);
898 RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI);
899
900
901 return true;
902}
903
904static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
905 Value *LHS, Value *RHS) {
906 Type *I32Ty = Builder.getInt32Ty();
907 Type *I64Ty = Builder.getInt64Ty();
908
909 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
910 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
911 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
912 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
913 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
914 Hi = Builder.CreateTrunc(Hi, I32Ty);
915 return std::pair(Lo, Hi);
916}
917
918static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
919 return getMul64(Builder, LHS, RHS).second;
920}
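// Illustrative pseudo-IR (not from the original source) for getMul64 on i32
// inputs; getMulHu simply keeps the high half:
//
//   %a64 = zext i32 %a to i64
//   %b64 = zext i32 %b to i64
//   %m64 = mul i64 %a64, %b64
//   %lo  = trunc i64 %m64 to i32
//   %sh  = lshr i64 %m64, 32
//   %hi  = trunc i64 %sh to i32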
921
922/// Figure out how many bits are really needed for this division.
923/// \p MaxDivBits is an optimization hint to bypass the second
924/// ComputeNumSignBits/computeKnownBits call if the first one is
925/// insufficient.
926unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
927 Value *Den,
928 unsigned MaxDivBits,
929 bool IsSigned) const {
930 assert(Num->getType()->getScalarSizeInBits() ==
931 Den->getType()->getScalarSizeInBits());
932 unsigned SSBits = Num->getType()->getScalarSizeInBits();
933 if (IsSigned) {
934 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
935 // A sign bit needs to be reserved for shrinking.
936 unsigned DivBits = SSBits - RHSSignBits + 1;
937 if (DivBits > MaxDivBits)
938 return SSBits;
939
940 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);
941
942 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
943 DivBits = SSBits - SignBits + 1;
944 return DivBits;
945 }
946
947 // All bits are used for unsigned division for Num or Den in range
948 // (SignedMax, UnsignedMax].
949 KnownBits Known = computeKnownBits(Den, DL, AC, &I);
950 if (Known.isNegative() || !Known.isNonNegative())
951 return SSBits;
952 unsigned RHSSignBits = Known.countMinLeadingZeros();
953 unsigned DivBits = SSBits - RHSSignBits;
954 if (DivBits > MaxDivBits)
955 return SSBits;
956
957 Known = computeKnownBits(Num, DL, AC, &I);
958 if (Known.isNegative() || !Known.isNonNegative())
959 return SSBits;
960 unsigned LHSSignBits = Known.countMinLeadingZeros();
961
962 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
963 DivBits = SSBits - SignBits;
964 return DivBits;
965}
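// Worked example (not from the original source): for an i32 udiv where
// computeKnownBits proves at least 16 leading zero bits on both operands,
// SignBits == 16 and DivBits == 32 - 16 == 16, so the 24-bit expansion below
// applies. For an i32 sdiv where both operands have 20 known sign bits,
// DivBits == 32 - 20 + 1 == 13, the extra +1 reserving room for the sign bit.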
966
967// The fractional part of a float is enough to accurately represent up to
968// a 24-bit signed integer.
969Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
970 BinaryOperator &I, Value *Num,
971 Value *Den, bool IsDiv,
972 bool IsSigned) const {
973 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
974 if (DivBits > 24)
975 return nullptr;
976 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
977}
978
979Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
980 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
981 unsigned DivBits, bool IsDiv, bool IsSigned) const {
982 Type *I32Ty = Builder.getInt32Ty();
983 Num = Builder.CreateTrunc(Num, I32Ty);
984 Den = Builder.CreateTrunc(Den, I32Ty);
985
986 Type *F32Ty = Builder.getFloatTy();
987 ConstantInt *One = Builder.getInt32(1);
988 Value *JQ = One;
989
990 if (IsSigned) {
991 // char|short jq = ia ^ ib;
992 JQ = Builder.CreateXor(Num, Den);
993
994 // jq = jq >> (bitsize - 2)
995 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
996
997 // jq = jq | 0x1
998 JQ = Builder.CreateOr(JQ, One);
999 }
1000
1001 // int ia = (int)LHS;
1002 Value *IA = Num;
1003
1004 // int ib = (int)RHS;
1005 Value *IB = Den;
1006
1007 // float fa = (float)ia;
1008 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1009 : Builder.CreateUIToFP(IA, F32Ty);
1010
1011 // float fb = (float)ib;
1012 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1013 : Builder.CreateUIToFP(IB,F32Ty);
1014
1015 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1016 Builder.getFloatTy(), {FB});
1017 Value *FQM = Builder.CreateFMul(FA, RCP);
1018
1019 // fq = trunc(fqm);
1020 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1021 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1022
1023 // float fqneg = -fq;
1024 Value *FQNeg = Builder.CreateFNeg(FQ);
1025
1026 // float fr = mad(fqneg, fb, fa);
1027 auto FMAD = !ST.hasMadMacF32Insts()
1028 ? Intrinsic::fma
1029 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1030 Value *FR = Builder.CreateIntrinsic(FMAD,
1031 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1032
1033 // int iq = (int)fq;
1034 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1035 : Builder.CreateFPToUI(FQ, I32Ty);
1036
1037 // fr = fabs(fr);
1038 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1039
1040 // fb = fabs(fb);
1041 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1042
1043 // int cv = fr >= fb;
1044 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1045
1046 // jq = (cv ? jq : 0);
1047 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1048
1049 // dst = iq + jq;
1050 Value *Div = Builder.CreateAdd(IQ, JQ);
1051
1052 Value *Res = Div;
1053 if (!IsDiv) {
1054 // Rem needs compensation; it's easier to recompute it.
1055 Value *Rem = Builder.CreateMul(Div, Den);
1056 Res = Builder.CreateSub(Num, Rem);
1057 }
1058
1059 if (DivBits != 0 && DivBits < 32) {
1060 // Extend in register from the number of bits this divide really is.
1061 if (IsSigned) {
1062 int InRegBits = 32 - DivBits;
1063
1064 Res = Builder.CreateShl(Res, InRegBits);
1065 Res = Builder.CreateAShr(Res, InRegBits);
1066 } else {
1067 ConstantInt *TruncMask
1068 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1069 Res = Builder.CreateAnd(Res, TruncMask);
1070 }
1071 }
1072
1073 return Res;
1074}
1075
1076// Try to recognize special cases for which the DAG will emit better
1077// expansions than the general expansion we do here.
1078
1079// TODO: It would be better to just directly handle those optimizations here.
1080bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1081 Value *Num,
1082 Value *Den) const {
1083 if (Constant *C = dyn_cast<Constant>(Den)) {
1084 // Arbitrary constants get a better expansion as long as a wider mulhi is
1085 // legal.
1086 if (C->getType()->getScalarSizeInBits() <= 32)
1087 return true;
1088
1089 // TODO: Sdiv check for not exact for some reason.
1090
1091 // If there's no wider mulhi, there's only a better expansion for powers of
1092 // two.
1093 // TODO: Should really know for each vector element.
1094 if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT))
1095 return true;
1096
1097 return false;
1098 }
1099
1100 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1101 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1102 if (BinOpDen->getOpcode() == Instruction::Shl &&
1103 isa<Constant>(BinOpDen->getOperand(0)) &&
1104 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) {
1105 return true;
1106 }
1107 }
1108
1109 return false;
1110}
1111
1112static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1113 // Check whether the sign can be determined statically.
1114 KnownBits Known = computeKnownBits(V, DL);
1115 if (Known.isNegative())
1116 return Constant::getAllOnesValue(V->getType());
1117 if (Known.isNonNegative())
1118 return Constant::getNullValue(V->getType());
1119 return Builder.CreateAShr(V, Builder.getInt32(31));
1120}
1121
1122Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1123 BinaryOperator &I, Value *X,
1124 Value *Y) const {
1125 Instruction::BinaryOps Opc = I.getOpcode();
1126 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1127 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1128
1129 FastMathFlags FMF;
1130 FMF.setFast();
1131 Builder.setFastMathFlags(FMF);
1132
1133 if (divHasSpecialOptimization(I, X, Y))
1134 return nullptr; // Keep it for later optimization.
1135
1136 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1137 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1138
1139 Type *Ty = X->getType();
1140 Type *I32Ty = Builder.getInt32Ty();
1141 Type *F32Ty = Builder.getFloatTy();
1142
1143 if (Ty->getScalarSizeInBits() != 32) {
1144 if (IsSigned) {
1145 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1146 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1147 } else {
1148 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1149 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1150 }
1151 }
1152
1153 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1154 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1155 Builder.CreateZExtOrTrunc(Res, Ty);
1156 }
1157
1158 ConstantInt *Zero = Builder.getInt32(0);
1159 ConstantInt *One = Builder.getInt32(1);
1160
1161 Value *Sign = nullptr;
1162 if (IsSigned) {
1163 Value *SignX = getSign32(X, Builder, DL);
1164 Value *SignY = getSign32(Y, Builder, DL);
1165 // Remainder sign is the same as LHS
1166 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1167
1168 X = Builder.CreateAdd(X, SignX);
1169 Y = Builder.CreateAdd(Y, SignY);
1170
1171 X = Builder.CreateXor(X, SignX);
1172 Y = Builder.CreateXor(Y, SignY);
1173 }
1174
1175 // The algorithm here is based on ideas from "Software Integer Division", Tom
1176 // Rodeheffer, August 2008.
1177 //
1178 // unsigned udiv(unsigned x, unsigned y) {
1179 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1180 // // that this is a lower bound on inv(y), even if some of the calculations
1181 // // round up.
1182 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1183 //
1184 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1185 // // Empirically this is guaranteed to give a "two-y" lower bound on
1186 // // inv(y).
1187 // z += umulh(z, -y * z);
1188 //
1189 // // Quotient/remainder estimate.
1190 // unsigned q = umulh(x, z);
1191 // unsigned r = x - q * y;
1192 //
1193 // // Two rounds of quotient/remainder refinement.
1194 // if (r >= y) {
1195 // ++q;
1196 // r -= y;
1197 // }
1198 // if (r >= y) {
1199 // ++q;
1200 // r -= y;
1201 // }
1202 //
1203 // return q;
1204 // }
1205
1206 // Initial estimate of inv(y).
1207 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1208 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1209 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1210 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1211 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1212
1213 // One round of UNR.
1214 Value *NegY = Builder.CreateSub(Zero, Y);
1215 Value *NegYZ = Builder.CreateMul(NegY, Z);
1216 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1217
1218 // Quotient/remainder estimate.
1219 Value *Q = getMulHu(Builder, X, Z);
1220 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1221
1222 // First quotient/remainder refinement.
1223 Value *Cond = Builder.CreateICmpUGE(R, Y);
1224 if (IsDiv)
1225 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1226 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1227
1228 // Second quotient/remainder refinement.
1229 Cond = Builder.CreateICmpUGE(R, Y);
1230 Value *Res;
1231 if (IsDiv)
1232 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1233 else
1234 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1235
1236 if (IsSigned) {
1237 Res = Builder.CreateXor(Res, Sign);
1238 Res = Builder.CreateSub(Res, Sign);
1239 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1240 } else {
1241 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1242 }
1243 return Res;
1244}
1245
1246Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1247 BinaryOperator &I, Value *Num,
1248 Value *Den) const {
1249 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1250 return nullptr; // Keep it for later optimization.
1251
1252 Instruction::BinaryOps Opc = I.getOpcode();
1253
1254 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1255 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1256
1257 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1258 if (NumDivBits > 32)
1259 return nullptr;
1260
1261 Value *Narrowed = nullptr;
1262 if (NumDivBits <= 24) {
1263 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1264 IsDiv, IsSigned);
1265 } else if (NumDivBits <= 32) {
1266 Narrowed = expandDivRem32(Builder, I, Num, Den);
1267 }
1268
1269 if (Narrowed) {
1270 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1271 Builder.CreateZExt(Narrowed, Num->getType());
1272 }
1273
1274 return nullptr;
1275}
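// Illustrative example (not from the original source): an i64 udiv whose
// operands are both zero-extended from i16 has at least 48 known leading
// zeros, so getDivNumBits returns 16 and the division is done with the narrow
// 24-bit expansion, with the result zero-extended back to i64, avoiding the
// full 64-bit expansion.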
1276
1277void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1278 Instruction::BinaryOps Opc = I.getOpcode();
1279 // Do the general expansion.
1280 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1281 expandDivisionUpTo64Bits(&I);
1282 return;
1283 }
1284
1285 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1286 expandRemainderUpTo64Bits(&I);
1287 return;
1288 }
1289
1290 llvm_unreachable("not a division");
1291}
1292
1293/*
1294This can cause an inconsistency for non-byte-sized loads, for example:
1295```
1296 %load = load i1, ptr addrspace(4) %arg, align 4
1297 %zext = zext i1 %load to i64
1298 %add = add i64 %zext
1299```
1300Instead of creating `s_and_b32 s0, s0, 1`,
1301it will create `s_and_b32 s0, s0, 0xff`.
1302We accept this change since the non-byte load assumes the upper bits
1303within the byte are all 0.
1304*/
1305static bool tryNarrowMathIfNoOverflow(Instruction *I,
1306 const SITargetLowering *TLI,
1307 const TargetTransformInfo &TTI,
1308 const DataLayout &DL) {
1309 unsigned Opc = I->getOpcode();
1310 Type *OldType = I->getType();
1311
1312 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1313 return false;
1314
1315 unsigned OrigBit = OldType->getScalarSizeInBits();
1316
1317 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1318 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1319 "Instruction::Mul.");
1320
1321 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1322
1323 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1324 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1325 if (!NewType)
1326 return false;
1327 unsigned NewBit = NewType->getIntegerBitWidth();
1328 if (NewBit >= OrigBit)
1329 return false;
1330 NewType = I->getType()->getWithNewBitWidth(NewBit);
1331
1332 // Old cost
1333 InstructionCost OldCost =
1335 // New cost of new op
1336 InstructionCost NewCost =
1338 // New cost of narrowing 2 operands (use trunc)
1339 int NumOfNonConstOps = 2;
1340 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1341 // Operands cannot both be constant; that would already have been folded.
1342 NumOfNonConstOps = 1;
1343 }
1344 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1345 NewType, OldType,
1348 // New cost of zext narrowed result to original type
1349 NewCost +=
1350 TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
1352 if (NewCost >= OldCost)
1353 return false;
1354
1355 IRBuilder<> Builder(I);
1356 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1357 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1358 Value *Arith =
1359 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1360
1361 Value *Zext = Builder.CreateZExt(Arith, OldType);
1362 I->replaceAllUsesWith(Zext);
1363 I->eraseFromParent();
1364 return true;
1365}
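// Illustrative example (not from the original source), assuming i32 is the
// smallest legal integer type and the cost model favors narrowing: an i64 add
// whose result is known to need at most 16 active bits becomes
//
//   %a32 = trunc i64 %a to i32
//   %b32 = trunc i64 %b to i32
//   %add = add i32 %a32, %b32
//   %res = zext i32 %add to i64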
1366
1367bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1368 if (foldBinOpIntoSelect(I))
1369 return true;
1370
1371 if (UseMul24Intrin && replaceMulWithMul24(I))
1372 return true;
1373 if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
1374 TM.getTargetTransformInfo(F), DL))
1375 return true;
1376
1377 bool Changed = false;
1378 Instruction::BinaryOps Opc = I.getOpcode();
1379 Type *Ty = I.getType();
1380 Value *NewDiv = nullptr;
1381 unsigned ScalarSize = Ty->getScalarSizeInBits();
1382
1383 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1384
1385 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1386 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1387 ScalarSize <= 64 &&
1388 !DisableIDivExpand) {
1389 Value *Num = I.getOperand(0);
1390 Value *Den = I.getOperand(1);
1391 IRBuilder<> Builder(&I);
1392 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1393
1394 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1395 NewDiv = PoisonValue::get(VT);
1396
1397 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1398 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1399 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1400
1401 Value *NewElt;
1402 if (ScalarSize <= 32) {
1403 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1404 if (!NewElt)
1405 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1406 } else {
1407 // See if this 64-bit division can be shrunk to 32/24-bits before
1408 // producing the general expansion.
1409 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1410 if (!NewElt) {
1411 // The general 64-bit expansion introduces control flow and doesn't
1412 // return the new value. Just insert a scalar copy and defer
1413 // expanding it.
1414 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1415 // CreateBinOp does constant folding. If the operands are constant,
1416 // it will return a Constant instead of a BinaryOperator.
1417 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1418 Div64ToExpand.push_back(NewEltBO);
1419 }
1420 }
1421
1422 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1423 NewEltI->copyIRFlags(&I);
1424
1425 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1426 }
1427 } else {
1428 if (ScalarSize <= 32)
1429 NewDiv = expandDivRem32(Builder, I, Num, Den);
1430 else {
1431 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1432 if (!NewDiv)
1433 Div64ToExpand.push_back(&I);
1434 }
1435 }
1436
1437 if (NewDiv) {
1438 I.replaceAllUsesWith(NewDiv);
1439 I.eraseFromParent();
1440 Changed = true;
1441 }
1442 }
1443
1444 if (ExpandDiv64InIR) {
1445 // TODO: We get much worse code in specially handled constant cases.
1446 for (BinaryOperator *Div : Div64ToExpand) {
1447 expandDivRem64(*Div);
1448 FlowChanged = true;
1449 Changed = true;
1450 }
1451 }
1452
1453 return Changed;
1454}
1455
1456bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1457 if (!WidenLoads)
1458 return false;
1459
1460 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1461 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1462 canWidenScalarExtLoad(I)) {
1463 IRBuilder<> Builder(&I);
1464 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1465
1466 Type *I32Ty = Builder.getInt32Ty();
1467 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1468 WidenLoad->copyMetadata(I);
1469
1470 // If we have range metadata, we need to convert the type, and not make
1471 // assumptions about the high bits.
1472 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1473 ConstantInt *Lower =
1474 mdconst::extract<ConstantInt>(Range->getOperand(0));
1475
1476 if (Lower->isNullValue()) {
1477 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1478 } else {
1479 Metadata *LowAndHigh[] = {
1480 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1481 // Don't make assumptions about the high bits.
1482 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1483 };
1484
1485 WidenLoad->setMetadata(LLVMContext::MD_range,
1486 MDNode::get(F.getContext(), LowAndHigh));
1487 }
1488 }
1489
1490 int TySize = DL.getTypeSizeInBits(I.getType());
1491 Type *IntNTy = Builder.getIntNTy(TySize);
1492 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1493 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1494 I.replaceAllUsesWith(ValOrig);
1495 I.eraseFromParent();
1496 return true;
1497 }
1498
1499 return false;
1500}
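// Illustrative example (not from the original source): a simple, uniform,
// 32-bit aligned sub-dword load from constant memory such as
//
//   %val = load i8, ptr addrspace(4) %p, align 4
//
// is widened to
//
//   %wide = load i32, ptr addrspace(4) %p, align 4
//   %val  = trunc i32 %wide to i8
//
// which allows selecting a scalar (SMEM) load instead of a vector load.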
1501
1502bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1503 Value *Cond = I.getCondition();
1504 Value *TrueVal = I.getTrueValue();
1505 Value *FalseVal = I.getFalseValue();
1506 Value *CmpVal;
1507 CmpPredicate Pred;
1508
1509 // Match fract pattern with nan check.
1510 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1511 return false;
1512
1513 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1514 if (!FPOp)
1515 return false;
1516
1517 IRBuilder<> Builder(&I);
1518 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1519
1520 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1521 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1522
1523 Value *Fract = nullptr;
1524 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1525 CmpVal == matchFractPat(*IIFalse)) {
1526 // isnan(x) ? x : fract(x)
1527 Fract = applyFractPat(Builder, CmpVal);
1528 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1529 CmpVal == matchFractPat(*IITrue)) {
1530 // !isnan(x) ? fract(x) : x
1531 Fract = applyFractPat(Builder, CmpVal);
1532 } else
1533 return false;
1534
1535 Fract->takeName(&I);
1536 I.replaceAllUsesWith(Fract);
1537 RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
1538 return true;
1539}
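// Illustrative shape of the select pattern handled above (not from the
// original source), where %fract is whatever matchFractPat recognized as the
// fract computation of %x:
//
//   %isnan = fcmp uno float %x, 0.0
//   %sel   = select i1 %isnan, float %x, float %fract
//
// The whole thing is replaced by the amdgcn fract intrinsic applied to %x,
// which handles the nan input case itself.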
1540
1541static bool areInSameBB(const Value *A, const Value *B) {
1542 const auto *IA = dyn_cast<Instruction>(A);
1543 const auto *IB = dyn_cast<Instruction>(B);
1544 return IA && IB && IA->getParent() == IB->getParent();
1545}
1546
1547// Helper for breaking large PHIs that returns true when an extractelement on V
1548// is likely to be folded away by the DAG combiner.
1549static bool isInterestingPHIIncomingValue(const Value *V) {
1550 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1551 if (!FVT)
1552 return false;
1553
1554 const Value *CurVal = V;
1555
1556 // Check for insertelements, keeping track of the elements covered.
1557 BitVector EltsCovered(FVT->getNumElements());
1558 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1559 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1560
1561 // Non constant index/out of bounds index -> folding is unlikely.
1562 // The latter is more of a sanity check because canonical IR should just
1563 // have replaced those with poison.
1564 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1565 return false;
1566
1567 const auto *VecSrc = IE->getOperand(0);
1568
1569 // If the vector source is another instruction, it must be in the same basic
1570 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1571 // unlikely to be able to do anything interesting here.
1572 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1573 return false;
1574
1575 CurVal = VecSrc;
1576 EltsCovered.set(Idx->getZExtValue());
1577
1578 // All elements covered.
1579 if (EltsCovered.all())
1580 return true;
1581 }
1582
1583 // We either didn't find a single insertelement, or the insertelement chain
1584 // ended before all elements were covered. Check for other interesting values.
1585
1586 // Constants are always interesting because we can just constant fold the
1587 // extractelements.
1588 if (isa<Constant>(CurVal))
1589 return true;
1590
1591 // shufflevector is likely to be profitable if either operand is a constant,
1592 // or if either source is in the same block.
1593 // This is because shufflevector is most often lowered as a series of
1594 // insert/extract elements anyway.
1595 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1596 return isa<Constant>(SV->getOperand(1)) ||
1597 areInSameBB(SV, SV->getOperand(0)) ||
1598 areInSameBB(SV, SV->getOperand(1));
1599 }
1600
1601 return false;
1602}
1603
1604static void collectPHINodes(const PHINode &I,
1605 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1606 const auto [It, Inserted] = SeenPHIs.insert(&I);
1607 if (!Inserted)
1608 return;
1609
1610 for (const Value *Inc : I.incoming_values()) {
1611 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1612 collectPHINodes(*PhiInc, SeenPHIs);
1613 }
1614
1615 for (const User *U : I.users()) {
1616 if (const auto *PhiU = dyn_cast<PHINode>(U))
1617 collectPHINodes(*PhiU, SeenPHIs);
1618 }
1619}
1620
1621bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1622 // Check in the cache first.
1623 if (const auto It = BreakPhiNodesCache.find(&I);
1624 It != BreakPhiNodesCache.end())
1625 return It->second;
1626
1627 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1628 // recursively consider all its users and incoming values that are also PHI
1629 // nodes. We then make a decision about all of those PHIs at once. Either they
1630 // all get broken up, or none of them do. That way, we avoid cases where a
1631 // single PHI is/is not broken and we end up reforming/exploding a vector
1632 // multiple times, or even worse, doing it in a loop.
1633 SmallPtrSet<const PHINode *, 8> WorkList;
1634 collectPHINodes(I, WorkList);
1635
1636#ifndef NDEBUG
1637 // Check that none of the PHI nodes in the worklist are in the map. If some of
1638 // them are, it means we're not good enough at collecting related PHIs.
1639 for (const PHINode *WLP : WorkList) {
1640 assert(BreakPhiNodesCache.count(WLP) == 0);
1641 }
1642#endif
1643
1644 // To consider a PHI profitable to break, we need to see some interesting
1645 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1646 // must have one to consider all PHIs breakable.
1647 //
1648 // This threshold has been determined through performance testing.
1649 //
1650 // Note that the computation below is equivalent to
1651 //
1652 // (unsigned)ceil((K / 3.0) * 2)
1653 //
1654 // It's simply written this way to avoid mixing integral/FP arithmetic.
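 // For example, with 5 PHIs in the worklist: alignTo(5 * 2, 3) / 3 == 12 / 3
 // == 4, which matches (unsigned)ceil((5 / 3.0) * 2) == 4.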
1655 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1656 unsigned NumBreakablePHIs = 0;
1657 bool CanBreak = false;
1658 for (const PHINode *Cur : WorkList) {
1659 // Don't break PHIs that have no interesting incoming values. That is, where
1660 // there is no clear opportunity to fold the "extractelement" instructions
1661 // we would add.
1662 //
1663 // Note: IC does not run after this pass, so we're only interested in the
1664 // foldings that the DAG combiner can do.
1665 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1666 if (++NumBreakablePHIs >= Threshold) {
1667 CanBreak = true;
1668 break;
1669 }
1670 }
1671 }
1672
1673 for (const PHINode *Cur : WorkList)
1674 BreakPhiNodesCache[Cur] = CanBreak;
1675
1676 return CanBreak;
1677}
1678
1679/// Helper class for "break large PHIs" (visitPHINode).
1680///
1681/// This represents a slice of a PHI's incoming value, which is made up of:
1682/// - The type of the slice (Ty)
1683/// - The index in the incoming value's vector where the slice starts (Idx)
1684/// - The number of elements in the slice (NumElts).
1685/// It also keeps track of the NewPHI node inserted for this particular slice.
1686///
1687/// Slice examples:
1688/// <4 x i64> -> Split into four i64 slices.
1689/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1690/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1691/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1692 class VectorSlice {
1693 public:
1694 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1695 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1696
1697 Type *Ty = nullptr;
1698 unsigned Idx = 0;
1699 unsigned NumElts = 0;
1700 PHINode *NewPHI = nullptr;
1701
1702 /// Slice \p Inc according to the information contained within this slice.
1703 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1704 /// pair, it returns the same Sliced value as well.
1705 ///
1706 /// Note this *intentionally* does not return the same value for, say,
1707 /// [%bb.0, %0] & [%bb.1, %0] as:
1708 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, the
1709 /// sliced value is created in bb.1 and may not be available at the end of
1710 /// bb.0, where the incoming value for that edge is needed.)
1711 /// - We also want to make our extract instructions as local as possible so
1712 /// the DAG has better chances of folding them out. Duplicating them like
1713 /// that is beneficial in that regard.
1714 ///
1715 /// This is both a minor optimization to avoid creating duplicate
1716 /// instructions and a requirement for correctness. It is not forbidden
1717 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1718 /// returned a new value each time, those previously identical pairs would all
1719 /// have different incoming values (from the same block) and it'd cause a "PHI
1720 /// node has multiple entries for the same basic block with different incoming
1721 /// values!" verifier error.
1722 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1723 Value *&Res = SlicedVals[{BB, Inc}];
1724 if (Res)
1725 return Res;
1726
1727 IRBuilder<> B(BB->getTerminator());
1728 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1729 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1730
1731 if (NumElts > 1) {
1732 SmallVector<int, 4> Mask;
1733 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1734 Mask.push_back(K);
1735 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1736 } else
1737 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1738
1739 return Res;
1740 }
1741
1742private:
1744};
1745
1746bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1747 // Break up fixed-vector PHIs into smaller pieces.
1748 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1749 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1750 //
1751 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1752 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1753 // With large, odd-sized PHIs we may end up needing many `build_vector`
1754 // operations with most elements being "undef". This inhibits a lot of
1755 // optimization opportunities and can result in unreasonably high register
1756 // pressure and the inevitable stack spilling.
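// A rough sketch of the rewrite (hypothetical IR, block names made up):
//   %v = phi <4 x i64> [ %a, %bb0 ], [ %b, %bb1 ]
// becomes four scalar PHIs such as
//   %v.e0 = phi i64 [ %a.e0, %bb0 ], [ %b.e0, %bb1 ]
// where %a.e0 = extractelement <4 x i64> %a, i64 0 is emitted at the end of
// %bb0, and the original vector is rebuilt with insertelement instructions
// after the new PHIs.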
1757 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1758 return false;
1759
1760 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1761 if (!FVT || FVT->getNumElements() == 1 ||
1762 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1763 return false;
1764
1765 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1766 return false;
1767
1768 std::vector<VectorSlice> Slices;
1769
1770 Type *EltTy = FVT->getElementType();
1771 {
1772 unsigned Idx = 0;
1773 // For 8/16-bit element types, don't scalarize fully; break the vector into
1774 // as many 32-bit slices as we can, and scalarize the tail.
1775 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1776 const unsigned NumElts = FVT->getNumElements();
1777 if (EltSize == 8 || EltSize == 16) {
1778 const unsigned SubVecSize = (32 / EltSize);
1779 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1780 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1781 Idx += SubVecSize)
1782 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1783 }
1784
1785 // Scalarize all remaining elements.
1786 for (; Idx < NumElts; ++Idx)
1787 Slices.emplace_back(EltTy, Idx, 1);
1788 }
1789
1790 assert(Slices.size() > 1);
1791
1792 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1793 // creating the necessary instruction to extract the relevant slices of each
1794 // incoming value.
1795 IRBuilder<> B(I.getParent());
1796 B.SetCurrentDebugLocation(I.getDebugLoc());
1797
1798 unsigned IncNameSuffix = 0;
1799 for (VectorSlice &S : Slices) {
1800 // We need to reset the builder's insert point on each iteration, because
1801 // getSlicedVal may have inserted something into I's BB.
1802 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1803 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1804
1805 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1806 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1807 "largephi.extractslice" +
1808 std::to_string(IncNameSuffix++)),
1809 BB);
1810 }
1811 }
1812
1813 // And replace this PHI with a vector of all the previous PHI values.
1814 Value *Vec = PoisonValue::get(FVT);
1815 unsigned NameSuffix = 0;
1816 for (VectorSlice &S : Slices) {
1817 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1818 if (S.NumElts > 1)
1819 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1820 else
1821 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1822 }
1823
1824 I.replaceAllUsesWith(Vec);
1825 I.eraseFromParent();
1826 return true;
1827}
1828
1829/// \param V Value to check
1830/// \param DL DataLayout
1831/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1832/// \param AS Target Address Space
1833/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1834static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1835 const AMDGPUTargetMachine &TM, unsigned AS) {
1836 // Pointer cannot be null if it's a block address, GV or alloca.
1837 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1838 // it as the symbol could be null in such cases.
1839 if (isa<BlockAddress, GlobalValue, AllocaInst>(V))
1840 return true;
1841
1842 // Check nonnull arguments.
1843 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1844 return true;
1845
1846 // Check nonnull loads.
1847 if (const auto *Load = dyn_cast<LoadInst>(V);
1848 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1849 return true;
1850
1851 // getUnderlyingObject may have looked through another addrspacecast, although
1852 // the optimizable situations have most likely been folded out by now.
1853 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
1854 return false;
1855
1856 // TODO: Calls that return nonnull?
1857
1858 // For all other things, use KnownBits.
1859 // We either use 0 or all bits set to indicate null, so check that the value
1860 // cannot be zero (or cannot be all ones, depending on the address space).
1861 //
1862 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1863 // address spaces have non-zero null values.
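// For instance (a summary of the check below, assuming the usual AMDGPU
// convention that private/local null is all-ones): for those address spaces
// we require the known bits to rule out an all-ones pointer, while address
// spaces with a zero null use KnownBits::isNonZero() instead.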
1864 auto SrcPtrKB = computeKnownBits(V, DL);
1865 const auto NullVal = TM.getNullPointerValue(AS);
1866
1867 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1868 assert((NullVal == 0 || NullVal == -1) &&
1869 "don't know how to check for this null value!");
1870 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1871}
1872
1873bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1874 // The intrinsic doesn't support vectors. It also seems difficult to prove
1875 // that a vector cannot contain any null pointers, so it's unclear whether
1876 // this is worth supporting.
1877 if (I.getType()->isVectorTy())
1878 return false;
1879
1880 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
1881 // This is only worthwhile for casts between private/local and flat.
1882 const unsigned SrcAS = I.getSrcAddressSpace();
1883 const unsigned DstAS = I.getDestAddressSpace();
1884
1885 bool CanLower = false;
1886 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1887 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
1888 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
1889 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
1890 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1891 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
1892 if (!CanLower)
1893 return false;
1894
1895 SmallVector<const Value *, 4> WorkList;
1896 getUnderlyingObjects(I.getOperand(0), WorkList);
1897 if (!all_of(WorkList, [&](const Value *V) {
1898 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
1899 }))
1900 return false;
1901
1902 IRBuilder<> B(&I);
1903 auto *Intrin = B.CreateIntrinsic(
1904 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
1905 I.replaceAllUsesWith(Intrin);
1906 I.eraseFromParent();
1907 return true;
1908}
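// Minimal sketch of the rewrite (hypothetical IR; the intrinsic name mangling
// shown is approximate):
//   %flat = addrspacecast ptr addrspace(5) %alloca to ptr
// becomes, because an alloca is known non-null,
//   %flat = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
// letting codegen skip the null check it would otherwise emit for the cast.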
1909
1910bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
1911 switch (I.getIntrinsicID()) {
1912 case Intrinsic::minnum:
1913 case Intrinsic::minimumnum:
1914 case Intrinsic::minimum:
1915 return visitFMinLike(I);
1916 case Intrinsic::sqrt:
1917 return visitSqrt(I);
1918 default:
1919 return false;
1920 }
1921}
1922
1923/// Match non-nan fract pattern.
1924/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
1925/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
1926/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
1927///
1928 /// Matches only if fract is a useful instruction for the subtarget. Does not
1929 /// account for nan handling; the instruction has a nan check on the input value.
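// The matched IR looks roughly like this (a sketch; the constant is written
// symbolically rather than as its exact hex form):
//   %floor = call float @llvm.floor.f32(float %x)
//   %sub = fsub float %x, %floor
//   %min = call float @llvm.minnum.f32(float %sub, float <nextafter(1.0, -1.0)>)
// and visitFMinLike below rewrites it to a single llvm.amdgcn.fract call.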
1930Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
1931 if (ST.hasFractBug())
1932 return nullptr;
1933
1934 Intrinsic::ID IID = I.getIntrinsicID();
1935
1936 // The value is only used in contexts where we know the input isn't a nan, so
1937 // any of the fmin variants are fine.
1938 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
1939 IID != Intrinsic::minimumnum)
1940 return nullptr;
1941
1942 Type *Ty = I.getType();
1943 if (!isLegalFloatingTy(Ty->getScalarType()))
1944 return nullptr;
1945
1946 Value *Arg0 = I.getArgOperand(0);
1947 Value *Arg1 = I.getArgOperand(1);
1948
1949 const APFloat *C;
1950 if (!match(Arg1, m_APFloat(C)))
1951 return nullptr;
1952
1953 APFloat One(1.0);
1954 bool LosesInfo;
1955 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
1956
1957 // Match nextafter(1.0, -1.0)
1958 One.next(true);
1959 if (One != *C)
1960 return nullptr;
1961
1962 Value *FloorSrc;
1963 if (match(Arg0, m_FSub(m_Value(FloorSrc),
1964 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
1965 return FloorSrc;
1966 return nullptr;
1967}
1968
1969Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
1970 Value *FractArg) {
1971 SmallVector<Value *, 4> FractVals;
1972 extractValues(Builder, FractVals, FractArg);
1973
1974 SmallVector<Value *, 4> ResultVals(FractVals.size());
1975
1976 Type *Ty = FractArg->getType()->getScalarType();
1977 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
1978 ResultVals[I] =
1979 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
1980 }
1981
1982 return insertValues(Builder, FractArg->getType(), ResultVals);
1983}
1984
1985bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
1986 Value *FractArg = matchFractPat(I);
1987 if (!FractArg)
1988 return false;
1989
1990 // Match pattern for fract intrinsic in contexts where the nan check has been
1991 // optimized out (and hope the knowledge that the source can't be nan wasn't lost).
1992 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
1993 return false;
1994
1995 IRBuilder<> Builder(&I);
1996 FastMathFlags FMF = I.getFastMathFlags();
1997 FMF.setNoNaNs();
1998 Builder.setFastMathFlags(FMF);
1999
2000 Value *Fract = applyFractPat(Builder, FractArg);
2001 Fract->takeName(&I);
2002 I.replaceAllUsesWith(Fract);
2003
2004 RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
2005 return true;
2006}
2007
2008static bool isOneOrNegOne(const Value *Val) {
2009 const APFloat *C;
2010 return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2011}
2012
2013// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2014bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2015 Type *Ty = Sqrt.getType()->getScalarType();
2016 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2017 return false;
2018
2019 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2020 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2021
2022 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2023 // of fast llvm.sqrt will give the raw instruction anyway.
2024 if (SqrtFMF.approxFunc())
2025 return false;
2026
2027 const float ReqdAccuracy = FPOp->getFPAccuracy();
2028
2029 // Defer correctly rounded expansion to codegen.
2030 if (ReqdAccuracy < 1.0f)
2031 return false;
2032
2033 // FIXME: This is an ugly hack for this pass using forward iteration instead
2034 // of reverse. If it worked like a normal combiner, the rsq would form before
2035 // we saw a sqrt call.
2036 auto *FDiv =
2037 dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
2038 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2039 FDiv->getFPAccuracy() >= 1.0f &&
2040 canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
2041 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2042 isOneOrNegOne(FDiv->getOperand(0)))
2043 return false;
2044
2045 Value *SrcVal = Sqrt.getOperand(0);
2046 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2047
2048 // The raw instruction is 1 ulp, but the correction for denormal handling
2049 // brings it to 2.
2050 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2051 return false;
2052
2053 IRBuilder<> Builder(&Sqrt);
2054 SmallVector<Value *, 4> SrcVals;
2055 extractValues(Builder, SrcVals, SrcVal);
2056
2057 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2058 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2059 if (CanTreatAsDAZ)
2060 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2061 else
2062 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2063 }
2064
2065 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2066 NewSqrt->takeName(&Sqrt);
2067 Sqrt.replaceAllUsesWith(NewSqrt);
2068 Sqrt.eraseFromParent();
2069 return true;
2070}
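// Illustrative example (hypothetical IR): a call such as
//   %r = call float @llvm.sqrt.f32(float %x), !fpmath !0   ; !0 = !{float 2.0}
// is handled here: if %x can be treated as denormal-flushed, it becomes a
// direct call to the f32 sqrt intrinsic via getSqrtF32(); otherwise the
// 2 ulp IEEE-accurate expansion is emitted instead.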
2071
2072bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2073 if (skipFunction(F))
2074 return false;
2075
2076 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2077 if (!TPC)
2078 return false;
2079
2080 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2081 const TargetLibraryInfo *TLI =
2082 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2083 AssumptionCache *AC =
2084 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2085 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2086 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2087 const UniformityInfo &UA =
2088 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2089 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2090}
2091
2092 PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2093 FunctionAnalysisManager &FAM) {
2094 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2095 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2096 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2097 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2098 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2099 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2100 if (!Impl.run())
2101 return PreservedAnalyses::all();
2102 PreservedAnalyses PA = PreservedAnalyses::none();
2103 if (!Impl.FlowChanged)
2104 PA.preserveSet<CFGAnalyses>();
2105 return PA;
2106}
2107
2108INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2109 "AMDGPU IR optimizations", false, false)
2113INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2115
2116char AMDGPUCodeGenPrepare::ID = 0;
2117
2118 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2119 return new AMDGPUCodeGenPrepare();
2120}